1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2010, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Andreas Gogol-Doering <doering@mdc-berlin.de>
33 // ==========================================================================
34 // Support for writing and reading FASTA alignment files.
35 // ==========================================================================
36
37 #ifndef SEQAN_FILE_FILE_FORMAT_FASTA_ALIGN_H_
38 #define SEQAN_FILE_FILE_FORMAT_FASTA_ALIGN_H_
39
40 namespace seqan {
41
42 // ===========================================================================
43 // Forward Declarations
44 // ===========================================================================
45
46 //forward declarations
// Metafunction yielding the type of a single row of an alignment; only
// declared here, the definition lives elsewhere in the library.
template <typename T> struct Row;

// Metafunction yielding the type of the row container of an alignment; only
// declared here, the definition lives elsewhere in the library.
template <typename T> struct Rows;
52
53 // ===========================================================================
54 // Tags, Enums, Classes, Specializations
55 // ===========================================================================
56
57 /**
58 .Tag.File Format.tag.Fasta alignment:
59 FASTA alignment file format for sequences.
60 ..include:seqan/file.h
61 */
62 struct FastaAlign_;
63 typedef Tag<FastaAlign_> FastaAlign;
64
65 // ===========================================================================
66 // Metafunctions
67 // ===========================================================================
68
69 // ===========================================================================
70 // Functions
71 // ===========================================================================
72
73 template <typename TFile, typename TSize>
_fastaAlignScanLine(TFile & file,TSize & count)74 void _fastaAlignScanLine(TFile & file, TSize & count) {
75
76 SEQAN_CHECKPOINT;
77 SEQAN_ASSERT(!_streamEOF(file))
78
79 while (true) {
80 typename Value<TFile>::Type c = _streamGet(file);
81
82 if (_streamEOF(file)) return;
83 if (c == '\n') return;
84
85 if ((c != '\r') && (c!='-'))
86 ++count;
87 }
88 }
89
90 //////////////////////////////////////////////////////////////////////////////
91 // read
92 //////////////////////////////////////////////////////////////////////////////
93 template <typename TFile, typename TSource, typename TSpec>
read(TFile & file,Align<TSource,TSpec> & align,FastaAlign const &)94 void read(TFile & file, Align<TSource, TSpec> & align, FastaAlign const &) {
95 SEQAN_CHECKPOINT;
96
97 SEQAN_ASSERT_NOT(_streamEOF(file));
98
99 typedef typename Value<TSource>::Type TSourceValue;
100 typedef typename Size<TSourceValue>::Type TSize;
101 typedef typename Position<TFile>::Type TFilePos;
102 typedef Triple<TFilePos, TFilePos, TSize> TTriple;
103 TSize limit = maxValue<TSize>();
104
105 //Determine begin position, end position and length of each sequence
106 String<TTriple> beg_end_length;
107
108 TFilePos begin_pos;
109 TFilePos end_pos;
110 typename Value<TFile>::Type c;
111 TSize count;
112
113 while (!_streamEOF(file)) {
114 begin_pos = _streamTellG(file);
115 count = 0;
116 SEQAN_ASSERT_NOT(_streamEOF(file));
117
118
119 c = _streamGet(file);
120
121 // Skip id
122 if (c == '>') {
123 _fastaAlignScanLine(file, count);
124 begin_pos = _streamTellG(file);
125 count = 0;
126 } else { //If no id first letter belongs to sequence
127 count = 1;
128 }
129
130 // Count letters
131 while (true) {
132 _fastaAlignScanLine(file, count);
133
134 typename Value<TFile>::Type c = _streamGet(file);
135 if (c == '>') {
136 _streamSeek2G(file, -1);
137 end_pos = _streamTellG(file);
138 break;
139 }
140 if (_streamEOF(file)) {
141 end_pos = _streamTellG(file);
142 break;
143 }
144 if ((c != '\n') && (c != '\r') && (c!='-')) {
145 ++count;
146 }
147 }
148 if (count > limit) {
149 count = limit;
150 }
151
152 appendValue(beg_end_length, TTriple(begin_pos, end_pos, count));
153 }
154
155 // Resize alignment data structure
156 TSize numRows=length(beg_end_length);
157 resize(rows(align), numRows); //rows
158
159 typedef Align<TSource, TSpec> TAlign;
160 typedef typename Row<TAlign>::Type TRow;
161
162 for(TSize i=0;i<numRows;++i) {
163 TSize begin = beg_end_length[i].i1;
164 // TSize end = beg_end_length[i].i2;
165 count = beg_end_length[i].i3;
166
167 //Reserve space
168 clear(row(align,i));
169 createSource(row(align,i));
170 resize(source(row(align,i)),count);
171 if (length(source(row(align,i))) < count) {
172 count = length(source(row(align,i)));
173 }
174 setClippedEndPosition(row(align,i),count);
175
176 //Read sequence
177 _streamSeekG(file, begin);
178
179 typename Position<TSource>::Type pos;
180 for (pos = 0; pos < count; ) {
181 c = _streamGet(file);
182 if ((c != '\n') && (c != '\r') && (c != '-')) {
183 source(row(align,i))[pos] = c;
184 ++pos;
185 }
186 if (c=='-') {
187 insertGap(row(align,i), toViewPosition(row(align,i), pos));
188 }
189 }
190 }
191
192 _streamSeekG(file, 0);
193 }
194
195 //////////////////////////////////////////////////////////////////////////////
196 // readIDs
197 //////////////////////////////////////////////////////////////////////////////
198
199 template <typename TFile, typename TStringContainer>
readIDs(TFile & file,TStringContainer & ids,FastaAlign)200 void readIDs(TFile& file, TStringContainer& ids, FastaAlign) {
201
202 SEQAN_CHECKPOINT;
203
204 SEQAN_ASSERT_NOT(_streamEOF(file));
205
206 typedef typename Value<TStringContainer>::Type TString;
207 typename Position<TFile>::Type start_pos;
208 typename Value<TFile>::Type c;
209
210
211 TString id;
212 while(true) {
213 c = _streamGet(file);
214 while ((!_streamEOF(file)) && (c != '>')) c = _streamGet(file);
215 if (!_streamEOF(file)) {
216 start_pos = _streamTellG(file);
217 typename Size<TString>::Type count = 0;
218 _fastaAlignScanLine(file, count);
219 if (! count) clear(id);
220 else {
221 resize(id, count);
222 if (length(id) < count) {
223 count = length(id);
224 }
225 _streamSeekG(file, start_pos);
226 for (typename Position<TString>::Type pos = 0; pos<count; ++pos) {
227 id[pos] = _streamGet(file);
228 }
229 }
230 appendValue(ids, id);
231 } else {
232 break;
233 }
234 }
235 _streamSeekG(file, 0);
236 }
237
238 //////////////////////////////////////////////////////////////////////////////
239 // readMeta
240 //////////////////////////////////////////////////////////////////////////////
241
242 //Fasta file records have no meta data
243
244 template <typename TFile, typename TMeta>
readMeta(TFile &,TMeta & meta,FastaAlign)245 void readMeta(TFile & /*file*/, TMeta & meta, FastaAlign) {
246 SEQAN_CHECKPOINT
247 clear(meta);
248 }
249
250
251 //////////////////////////////////////////////////////////////////////////////
252 // goNext
253 //////////////////////////////////////////////////////////////////////////////
254 template <typename TFile>
goNext(TFile & file,FastaAlign)255 void goNext(TFile & file, FastaAlign) {
256 SEQAN_CHECKPOINT;
257 (void) file; // When compiled without assertions.
258 SEQAN_ASSERT_NOT(_streamEOF(file));
259
260 return;
261 }
262
263
264 //////////////////////////////////////////////////////////////////////////////
265 // write
266 //////////////////////////////////////////////////////////////////////////////
267
268 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec>
_writeImpl(TFile & file,Align<TSource,TSpec> const & align,TStringContainer const & ids,FastaAlign const &)269 void _writeImpl(TFile & file, Align<TSource, TSpec> const & align, TStringContainer const & ids, FastaAlign const &) {
270 SEQAN_CHECKPOINT
271
272 typedef Align<TSource, TSpec> const TAlign;
273 typedef typename Row<TAlign>::Type TRow;
274 typedef typename Position<typename Rows<TAlign>::Type>::Type TRowsPosition;
275 typedef typename Position<TAlign>::Type TPosition;
276 TRowsPosition row_count = length(rows(align));
277
278 for(TRowsPosition i=0;i<row_count;++i) {
279 TRow & row_ = row(align, i);
280
281 typedef typename Iterator<typename Row<TAlign>::Type const, Standard>::Type TIter;
282 TIter begin_ = iter(row_, beginPosition(cols(align)));
283 TIter end_ = iter(row_, endPosition(cols(align)));
284
285 _streamPut(file, '>');
286 _streamWrite(file, getValue(ids,i));
287 _streamPut(file, '\n');
288
289 int chars=0;
290 while(begin_ != end_) {
291 if (chars == 60) {
292 _streamPut(file, '\n');
293 chars = 0;
294 }
295 if (isGap(begin_)) _streamPut(file, gapValue<char>());
296 else _streamPut(file, getValue(source(begin_)));
297 chars++;
298 ++begin_;
299 }
300 _streamPut(file, '\n');
301 }
302 }
303
304 //____________________________________________________________________________
305
306 template <typename TFile, typename TSource, typename TSpec>
write(TFile & file,Align<TSource,TSpec> const & align,FastaAlign const &)307 void write(TFile & file, Align<TSource, TSpec> const & align, FastaAlign const & ) {
308 SEQAN_CHECKPOINT
309 _writeImpl(file, align, String<String<char> >(), FastaAlign());
310 }
311
312 //____________________________________________________________________________
313
314 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec>
write(TFile & file,Align<TSource,TSpec> const & align,TStringContainer const & ids,FastaAlign const &)315 void write(TFile & file, Align<TSource, TSpec> const & align, TStringContainer const & ids, FastaAlign const & ) {
316 SEQAN_CHECKPOINT
317 _writeImpl(file, align, ids, FastaAlign());
318 }
319
320
321 //VisualC++ const array bug workaround
// TODO(holtgrew): Superfluous?!
323 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec>
write(TFile & file,Align<TSource,TSpec> const * align,TStringContainer const & ids,FastaAlign const &)324 void write(TFile & file, Align<TSource, TSpec> const * align, TStringContainer const & ids, FastaAlign const & ) {
325 SEQAN_CHECKPOINT
326 _writeImpl(file, align, ids, FastaAlign());
327 }
328
329 //____________________________________________________________________________
330
331 template <typename TFile, typename TStringContainer, typename TSource, typename TSpec, typename TMeta>
write(TFile & file,Align<TSource,TSpec> const & align,TStringContainer const & ids,TMeta &,FastaAlign const &)332 void write(TFile & file, Align<TSource, TSpec> const & align, TStringContainer const & ids, TMeta &, FastaAlign const & ) {
333 SEQAN_CHECKPOINT;
334 _writeImpl(file, align, ids, FastaAlign());
335 }
336
337 } // namespace seqan
338
339 #endif // #ifndef SEQAN_FILE_FILE_FORMAT_FASTA_ALIGN_H_
340