1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Andreas Gogol-Doering <doering@mdc-berlin.de>
33 // ==========================================================================
34 // Support for writing and reading FASTA alignment files.
35 // ==========================================================================
36 
37 #ifndef SEQAN_FILE_FILE_FORMAT_FASTA_ALIGN_H_
38 #define SEQAN_FILE_FILE_FORMAT_FASTA_ALIGN_H_
39 
40 namespace seqan {
41 
42 // ===========================================================================
43 // Forward Declarations
44 // ===========================================================================
45 
// Row and Rows are only declared here (no definition appears in this file).
// The code below uses them as metafunctions, via Row<TAlign>::Type and
// Rows<TAlign>::Type.
template <typename TObject> struct Row;
template <typename TObject> struct Rows;
52 
53 // ===========================================================================
54 // Tags, Enums, Classes, Specializations
55 // ===========================================================================
56 
/**
.Tag.File Format.tag.Fasta alignment:
	FASTA alignment file format for sequences.
..include:seqan/file.h
*/
// FastaAlign_ is declared but not defined in this file; it is used only as the
// template argument that makes Tag<FastaAlign_> a distinct type.
struct FastaAlign_;
// Format tag accepted by the read/readIDs/readMeta/goNext/write overloads below
// to select the FASTA alignment implementation.
typedef Tag<FastaAlign_> FastaAlign;
64 
65 // ===========================================================================
66 // Metafunctions
67 // ===========================================================================
68 
69 // ===========================================================================
70 // Functions
71 // ===========================================================================
72 
73 template <typename TFile, typename TSize>
_fastaAlignScanLine(TFile & file,TSize & count)74 void _fastaAlignScanLine(TFile & file, TSize & count) {
75 
76 	SEQAN_CHECKPOINT;
77 	SEQAN_ASSERT(!_streamEOF(file))
78 
79 	while (true) {
80 		typename Value<TFile>::Type c = _streamGet(file);
81 
82 		if (_streamEOF(file)) return;
83 		if (c == '\n') return;
84 
85 		if ((c != '\r') && (c!='-'))
86 			++count;
87 	}
88 }
89 
90 //////////////////////////////////////////////////////////////////////////////
91 // read
92 //////////////////////////////////////////////////////////////////////////////
93 template <typename TFile, typename TSource, typename TSpec>
read(TFile & file,Align<TSource,TSpec> & align,FastaAlign const &)94 void read(TFile & file, Align<TSource, TSpec> & align, FastaAlign const &) {
95     SEQAN_CHECKPOINT;
96 
97 	SEQAN_ASSERT_NOT(_streamEOF(file));
98 
99 	typedef typename Value<TSource>::Type TSourceValue;
100 	typedef typename Size<TSourceValue>::Type TSize;
101 	typedef typename Position<TFile>::Type TFilePos;
102 	typedef Triple<TFilePos, TFilePos, TSize> TTriple;
103 	TSize limit = maxValue<TSize>();
104 
105 	//Determine begin position, end position and length of each sequence
106 	String<TTriple> beg_end_length;
107 
108 	TFilePos begin_pos;
109 	TFilePos end_pos;
110 	typename Value<TFile>::Type c;
111 	TSize count;
112 
113 	while (!_streamEOF(file)) {
114 		begin_pos = _streamTellG(file);
115 		count = 0;
116 		SEQAN_ASSERT_NOT(_streamEOF(file));
117 
118 
119 		c = _streamGet(file);
120 
121 		// Skip id
122 		if (c == '>') {
123 			_fastaAlignScanLine(file, count);
124 			begin_pos = _streamTellG(file);
125 			count = 0;
126 		} else {  //If no id first letter belongs to sequence
127 			count = 1;
128 		}
129 
130 		// Count letters
131 		while (true) {
132 			_fastaAlignScanLine(file, count);
133 
134 			typename Value<TFile>::Type c = _streamGet(file);
135 			if (c == '>') {
136 				_streamSeek2G(file, -1);
137 				end_pos = _streamTellG(file);
138 				break;
139 			}
140 			if (_streamEOF(file)) {
141 				end_pos = _streamTellG(file);
142 				break;
143 			}
144 			if ((c != '\n') && (c != '\r') && (c!='-'))	{
145 				++count;
146 			}
147 		}
148 		if (count > limit) {
149 			count = limit;
150 		}
151 
152 		appendValue(beg_end_length, TTriple(begin_pos, end_pos, count));
153 	}
154 
155 	// Resize alignment data structure
156 	TSize numRows=length(beg_end_length);
157 	resize(rows(align), numRows);	//rows
158 
159 	typedef Align<TSource, TSpec> TAlign;
160 	typedef typename Row<TAlign>::Type TRow;
161 
162 	for(TSize i=0;i<numRows;++i) {
163 		TSize begin = beg_end_length[i].i1;
164 //		TSize end = beg_end_length[i].i2;
165 		count = beg_end_length[i].i3;
166 
167 		//Reserve space
168 		clear(row(align,i));
169 		createSource(row(align,i));
170 		resize(source(row(align,i)),count);
171 		if (length(source(row(align,i))) < count) {
172 			count = length(source(row(align,i)));
173 		}
174 		setClippedEndPosition(row(align,i),count);
175 
176 		//Read sequence
177 		_streamSeekG(file, begin);
178 
179 		typename Position<TSource>::Type pos;
180 		for (pos = 0; pos < count; ) {
181 			c = _streamGet(file);
182 			if ((c != '\n') && (c != '\r') && (c != '-'))	{
183 				source(row(align,i))[pos] = c;
184 				++pos;
185 			}
186 			if (c=='-') {
187 				insertGap(row(align,i), toViewPosition(row(align,i), pos));
188 			}
189 		}
190 	}
191 
192 	_streamSeekG(file, 0);
193 }
194 
195 //////////////////////////////////////////////////////////////////////////////
196 // readIDs
197 //////////////////////////////////////////////////////////////////////////////
198 
199 template <typename TFile, typename TStringContainer>
readIDs(TFile & file,TStringContainer & ids,FastaAlign)200 void readIDs(TFile& file, TStringContainer& ids, FastaAlign) {
201 
202 	SEQAN_CHECKPOINT;
203 
204     SEQAN_ASSERT_NOT(_streamEOF(file));
205 
206 	typedef typename Value<TStringContainer>::Type TString;
207 	typename Position<TFile>::Type start_pos;
208 	typename Value<TFile>::Type c;
209 
210 
211 	TString id;
212 	while(true) {
213 		c = _streamGet(file);
214 		while ((!_streamEOF(file)) && (c != '>')) c = _streamGet(file);
215 		if (!_streamEOF(file)) {
216 			start_pos = _streamTellG(file);
217 			typename Size<TString>::Type count = 0;
218 			_fastaAlignScanLine(file, count);
219 			if (! count) clear(id);
220 			else {
221 				resize(id, count);
222 				if (length(id) < count)	{
223 					count = length(id);
224 				}
225 				_streamSeekG(file, start_pos);
226 				for (typename Position<TString>::Type pos = 0; pos<count; ++pos) {
227 					id[pos] = _streamGet(file);
228 				}
229 			}
230 			appendValue(ids, id);
231 		} else {
232 			break;
233 		}
234 	}
235 	_streamSeekG(file, 0);
236 }
237 
238 //////////////////////////////////////////////////////////////////////////////
239 // readMeta
240 //////////////////////////////////////////////////////////////////////////////
241 
242 //Fasta file records have no meta data
243 
244 template <typename TFile, typename TMeta>
readMeta(TFile &,TMeta & meta,FastaAlign)245 void readMeta(TFile & /*file*/, TMeta & meta, FastaAlign) {
246 	SEQAN_CHECKPOINT
247 	clear(meta);
248 }
249 
250 
251 //////////////////////////////////////////////////////////////////////////////
252 // goNext
253 //////////////////////////////////////////////////////////////////////////////
254 template <typename TFile>
goNext(TFile & file,FastaAlign)255 void goNext(TFile & file, FastaAlign) {
256 	SEQAN_CHECKPOINT;
257 	(void) file; // When compiled without assertions.
258 	SEQAN_ASSERT_NOT(_streamEOF(file));
259 
260 	return;
261 }
262 
263 
264 //////////////////////////////////////////////////////////////////////////////
265 // write
266 //////////////////////////////////////////////////////////////////////////////
267 
268 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec>
_writeImpl(TFile & file,Align<TSource,TSpec> const & align,TStringContainer const & ids,FastaAlign const &)269 void _writeImpl(TFile & file, Align<TSource, TSpec> const & align, TStringContainer const & ids, FastaAlign const &) {
270 	SEQAN_CHECKPOINT
271 
272 	typedef Align<TSource, TSpec> const TAlign;
273 	typedef typename Row<TAlign>::Type TRow;
274 	typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
275 	typedef typename Position<TAlign>::Type TPosition;
276 	TRowsPosition row_count = length(rows(align));
277 
278 	for(TRowsPosition i=0;i<row_count;++i) {
279 		TRow & row_ = row(align, i);
280 
281 		typedef typename Iterator<typename Row<TAlign>::Type const, Standard>::Type TIter;
282 		TIter begin_ = iter(row_, beginPosition(cols(align)));
283 		TIter end_ = iter(row_, endPosition(cols(align)));
284 
285 		_streamPut(file, '>');
286 		_streamWrite(file, getValue(ids,i));
287 		_streamPut(file, '\n');
288 
289 		int chars=0;
290 		while(begin_ != end_) {
291 			if (chars == 60) {
292 				_streamPut(file, '\n');
293 				chars = 0;
294 			}
295 			if (isGap(begin_)) _streamPut(file, gapValue<char>());
296 			else _streamPut(file, getValue(source(begin_)));
297 			chars++;
298 			++begin_;
299 		}
300 		_streamPut(file, '\n');
301 	}
302 }
303 
304 //____________________________________________________________________________
305 
306 template <typename TFile, typename TSource, typename TSpec>
write(TFile & file,Align<TSource,TSpec> const & align,FastaAlign const &)307 void write(TFile & file, Align<TSource, TSpec> const & align, FastaAlign const & ) {
308 	SEQAN_CHECKPOINT
309 	_writeImpl(file, align, String<String<char> >(), FastaAlign());
310 }
311 
312 //____________________________________________________________________________
313 
314 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec>
write(TFile & file,Align<TSource,TSpec> const & align,TStringContainer const & ids,FastaAlign const &)315 void write(TFile & file, Align<TSource, TSpec> const & align, TStringContainer const & ids, FastaAlign const & ) {
316 	SEQAN_CHECKPOINT
317 	_writeImpl(file, align, ids, FastaAlign());
318 }
319 
320 
321 //VisualC++ const array bug workaround
322 // TODO(holtgrew): Superflous?!
323 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec>
write(TFile & file,Align<TSource,TSpec> const * align,TStringContainer const & ids,FastaAlign const &)324 void write(TFile & file, Align<TSource, TSpec> const * align, TStringContainer const & ids, FastaAlign const & ) {
325 	SEQAN_CHECKPOINT
326 	_writeImpl(file, align, ids, FastaAlign());
327 }
328 
329 //____________________________________________________________________________
330 
331 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec, typename TMeta>
write(TFile & file,Align<TSource,TSpec> const & align,TStringContainer const & ids,TMeta &,FastaAlign const &)332 void write(TFile & file, Align<TSource, TSpec> const & align, TStringContainer const & ids, TMeta &, FastaAlign const & ) {
333 	SEQAN_CHECKPOINT;
334 	_writeImpl(file, align, ids, FastaAlign());
335 }
336 
337 }  // namespace seqan
338 
339 #endif   // #ifndef SEQAN_FILE_FILE_FORMAT_FASTA_ALIGN_H_
340