1 // ==========================================================================
2 // SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 // * Redistributions of source code must retain the above copyright
11 // notice, this list of conditions and the following disclaimer.
12 // * Redistributions in binary form must reproduce the above copyright
13 // notice, this list of conditions and the following disclaimer in the
14 // documentation and/or other materials provided with the distribution.
15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 // its contributors may be used to endorse or promote products derived
17 // from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
33 // ==========================================================================
34 // This file contains routines to read BLAST tab-seperated output
35 // ==========================================================================
36
37 #ifndef SEQAN_BLAST_BLAST_TABULAR_READ_H_
38 #define SEQAN_BLAST_BLAST_TABULAR_READ_H_
39
40 /* IMPLEMENTATION NOTES
41
42 BLAST TABULAR example:
43
44 The format of a blast tabular output file is less simple than it looks, here's
45 the general form
46
47 COMMENTLINES
48 MATCH
49 MATCH
50 MATCH
51 MATCH
52 COMMENTLINES
53 MATCH
54 COMMENTLINES
55 COMMENTLINES
56 ...
57
58 => COMMENTLINES for each sequence, 0-n Matchs for each sequence
59 => Each record is one-line, each COMMENTLINES is multiline
60
61 COMMENTLINES usually consists of:
62
63 # Program Tag [e.g BLASTX 2.2.27+ ]
64 # Query Id [ID of the query *sequence*]
65 # Database Id [note that this is not the name of the sequence in the db, but of
66 the database itself, e.g. "nr" -> usually the same for each file]
67 # Fields: [Headers of columns]
68 # n "hits found"
69
70 The first three lines are always written.
71 The Fields line is always writen by NCBI Blast, but only when hits > 0 by NCBI Blast+.
72 The "number of hits"-line is always printed by NCBI Blast+, and never by NCBI Blast.
73
74 Possibly other lines can be written as comments.
75
76 Because 0 matches are allowed, multiple COMMENTLINES can succeed each other, the
77 criterium for separation employed by this implementation is that an NCBI Blast
78 COMMENTLINES always ends after the "Fields" line and NCBI Blast+ COMMENTLINES end after
79 the "number of hits"-line.
80 */
81
82 namespace seqan
83 {
84
85 // ============================================================================
86 // Forwards
87 // ============================================================================
88
89 // ============================================================================
90 // Tags, Classes, Enums
91 // ============================================================================
92
93 // ----------------------------------------------------------------------------
94 // Type BlastFileIn
95 // ----------------------------------------------------------------------------
96
97 /*!
98 * @class BlastTabularFileIn
99 * @signature template <typename TBlastIOContext>
100 * using BlastTabularFileIn = FormattedFile<BlastTabular, Input, TBlastIOContext>;
101 * @extends FormattedFileIn
102 * @headerfile <seqan/blast.h>
103 * @brief FormattedFileIn abstraction for @link BlastTabular @endlink
104 *
105 * This is a @link FormattedFile @endlink specialization for reading @link BlastTabular @endlink formats. For details
106 * on how to influence the reading of files and how to differentiate between the tabular format without comment lines
107 * and the one with comment lines, see @link BlastIOContext @endlink.
108 * Please note that you have specify the type of the context as a template parameter to BlastTabularFileIn.
109 *
110 * @section Overview
111 *
112 * <ul>
113 * <li> open @link BlastTabularFileIn @endlink</li>
114 * <li> @link BlastTabularFileIn#readHeader @endlink </li>
115 * <li> while @link BlastTabularFileIn#onRecord @endlink </li>
116 * <ul><li> @link BlastTabularFileIn#readRecord @endlink</li></ul>
117 * <li> @link BlastTabularFileIn#readFooter @endlink </li>
118 * </ul>
119 *
120 * For a detailed example have a look at the
121 * <a href="http://seqan.readthedocs.io/en/develop/Tutorial/InputOutput/BlastIO.html">Blast IO tutorial</a>.
122 *
123 * @see BlastRecord
124 */
125
126 template <typename TBlastIOContext = BlastIOContext<>>
127 using BlastTabularFileIn = FormattedFile<BlastTabular, Input, TBlastIOContext>;
128
129 // ============================================================================
130 // Metafunctions
131 // ============================================================================
132
133 // ============================================================================
134 // Functions
135 // ============================================================================
136
137 // ----------------------------------------------------------------------------
138 // Function atEnd()
139 // ----------------------------------------------------------------------------
140
141 template <typename TScore,
142 typename TFwdIterator,
143 BlastProgram p,
144 BlastTabularSpec h>
145 inline bool
atEnd(BlastIOContext<TScore,p,h> const & context,TFwdIterator const & iter,BlastTabular const &)146 atEnd(BlastIOContext<TScore, p, h> const & context,
147 TFwdIterator const & iter,
148 BlastTabular const &)
149 {
150 return (atEnd(iter) && empty(context._lineBuffer));
151 }
152
153 template <typename TContext>
154 inline bool
atEnd(BlastTabularFileIn<TContext> & formattedFile)155 atEnd(BlastTabularFileIn<TContext> & formattedFile)
156 {
157 return atEnd(context(formattedFile), formattedFile.iter, BlastTabular());
158 }
159
160 // ----------------------------------------------------------------------------
161 // Function guessFormat()
162 // ----------------------------------------------------------------------------
163
164 template <typename TSpec>
guessFormat(FormattedFile<BlastTabular,Input,TSpec> & file)165 inline bool guessFormat(FormattedFile<BlastTabular, Input, TSpec> & file)
166 {
167 if (value(file.iter) == '#')
168 context(file).tabularSpec = BlastTabularSpec::COMMENTS;
169 else
170 context(file).tabularSpec = BlastTabularSpec::NO_COMMENTS;
171 return true;
172 }
173
174 // ----------------------------------------------------------------------------
175 // Function _onMatch()
176 // ----------------------------------------------------------------------------
177
178 template <typename TScore,
179 BlastProgram p,
180 BlastTabularSpec h>
181 inline bool
_onMatch(BlastIOContext<TScore,p,h> & context,BlastTabular const &)182 _onMatch(BlastIOContext<TScore, p, h> & context,
183 BlastTabular const &)
184 {
185 return (!startsWith(context._lineBuffer, "#") && !empty(context._lineBuffer));
186 }
187
188 // ----------------------------------------------------------------------------
189 // Function onRecord()
190 // ----------------------------------------------------------------------------
191
192 /*!
193 * @fn BlastTabularFileIn#onRecord
194 * @brief Returns whether the currently buffered line looks like the start of a record.
195 * @signature bool onRecord(blastTabularIn);
196 * @headerfile seqan/blast.h
197 *
198 * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
199 *
200 * @throw IOError On low-level I/O errors.
201 * @throw ParseError On high-level file format errors.
202 *
203 * @return bool true or false
204 */
205
206 template <typename TScore,
207 BlastProgram p,
208 BlastTabularSpec h>
209 inline bool
onRecord(BlastIOContext<TScore,p,h> & context,BlastTabular const &)210 onRecord(BlastIOContext<TScore, p, h> & context,
211 BlastTabular const &)
212 {
213 if (context.tabularSpec == BlastTabularSpec::NO_COMMENTS)
214 return _onMatch(context, BlastTabular());
215
216 // comment lines Footer
217 return (startsWith(context._lineBuffer, "#") && (!startsWith(context._lineBuffer, "# BLAST processed ")));
218 }
219
220 template <typename TContext>
221 inline bool
onRecord(BlastTabularFileIn<TContext> & formattedFile)222 onRecord(BlastTabularFileIn<TContext> & formattedFile)
223 {
224 return onRecord(context(formattedFile), BlastTabular());
225 }
226
227 // ----------------------------------------------------------------------------
228 // Function _goNextLine()
229 // ----------------------------------------------------------------------------
230
231 template <typename TScore,
232 typename TFwdIterator,
233 BlastProgram p,
234 BlastTabularSpec h>
235 inline void
_goNextLine(BlastIOContext<TScore,p,h> & context,TFwdIterator & iter,BlastTabular const &)236 _goNextLine(BlastIOContext<TScore, p, h> & context,
237 TFwdIterator & iter,
238 BlastTabular const &)
239 {
240 clear(context._lineBuffer);
241 if (!atEnd(iter))
242 readLine(context._lineBuffer, iter);
243 }
244
245 // ----------------------------------------------------------------------------
246 // Function _readCommentLines()
247 // ----------------------------------------------------------------------------
248
249 template <typename ... TSpecs,
250 typename TFwdIterator,
251 typename TScore,
252 BlastProgram p,
253 BlastTabularSpec h>
254 inline void
_readCommentLinesImpl(BlastRecord<TSpecs...> & r,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)255 _readCommentLinesImpl(BlastRecord<TSpecs...> & r,
256 TFwdIterator & iter,
257 BlastIOContext<TScore, p, h> & context,
258 BlastTabular const &)
259 {
260 // this is a match instead of comment lines
261 if (_onMatch(context, BlastTabular()))
262 SEQAN_THROW(ParseError("Commen lines expected, but not found."));
263 else
264 context.tabularSpec = BlastTabularSpec::COMMENTS;
265
266 int queryLinePresent = 0;
267 int dbLinePresent = 0;
268 int fieldsLinePresent = 0;
269 int hitsLinePresent = 0;
270
271 do
272 {
273 if (std::regex_search(std::begin(context._lineBuffer),
274 std::end(context._lineBuffer),
275 std::regex("^# T?BLAST")))
276 {
277 // last line of file
278 if (SEQAN_UNLIKELY(startsWith(context._lineBuffer, "# BLAST processed ") && !context.legacyFormat))
279 {
280 SEQAN_FAIL("ERROR: You called readRecord() when you should have called readFooter()."
281 "Always check onRecord() before calling readRecord().");
282 }
283 else // first line of the comments
284 {
285 assign(context.versionString, suffix(context._lineBuffer, 2));
286 context.blastProgram = _programStringToTag(prefix(context.versionString,
287 std::find(begin(context.versionString, Standard()),
288 end(context.versionString, Standard()),
289 ' ')
290 - begin(context.versionString, Standard())));
291
292 context.legacyFormat = !std::regex_search(std::begin(context.versionString),
293 std::end(context.versionString),
294 std::regex("\\d\\.\\d\\.\\d{1,2}\\+"));
295 }
296 }
297 else if (startsWith(context._lineBuffer, "# Query:"))
298 {
299 ++queryLinePresent;
300 assign(r.qId, suffix(context._lineBuffer, 9));
301
302 }
303 else if (startsWith(context._lineBuffer, "# Database:"))
304 {
305 ++dbLinePresent;
306 assign(context.dbName, suffix(context._lineBuffer, 12));
307 }
308 else if (startsWith(context._lineBuffer, "# Fields:"))
309 {
310 ++fieldsLinePresent;
311 clear(context._stringBuffer);
312 // make sure target is big enough
313 resize(context._stringBuffer, length(context._lineBuffer) - 10);
314 // use escape character as placeholder for replacement since strSplit only handles single characters
315 std::regex_replace(std::begin(context._stringBuffer),
316 std::begin(context._lineBuffer) + 10, // skip "# Fields:"
317 std::end(context._lineBuffer),
318 std::regex(", "),
319 "\x7F");
320 // shrink back down to actual size (replacing two letters with one makes string shorter!)
321 resize(context._stringBuffer, length(context._stringBuffer.c_str()));
322 strSplit(context.fieldsAsStrings, context._stringBuffer, EqualsChar<'\x7F'>(), true);
323
324 if (context.legacyFormat)
325 {
326 // assume defaults for LEGACY
327 if (!context.ignoreFieldsInComments)
328 appendValue(context.fields, BlastMatchField<>::Enum::STD);
329 }
330 else if (!context.ignoreFieldsInComments)
331 {
332 bool defaults = true;
333 resize(context.fields, length(context.fieldsAsStrings));
334 for (uint8_t j = 0; j < length(context.fieldsAsStrings); ++j)
335 {
336 for (uint8_t i = 0; i < length(BlastMatchField<>::columnLabels); ++i)
337 {
338 if (context.fieldsAsStrings[j] ==
339 BlastMatchField<>::columnLabels[i])
340 {
341 context.fields[j] =
342 static_cast<BlastMatchField<>::Enum>(i);
343
344 if ((j >= length(BlastMatchField<>::defaults)) &&
345 (static_cast<BlastMatchField<>::Enum>(i) !=
346 BlastMatchField<>::defaults[j]))
347 defaults = false;
348 break;
349 }
350 }
351 }
352 if (defaults) // replace multiple fields with the default meta field
353 {
354 clear(context.fields);
355 appendValue(context.fields, BlastMatchField<>::Enum::STD);
356 }
357 }
358 }
359 else
360 {
361 if (!context.legacyFormat)
362 {
363 // is hits counter?
364 if (endsWith(context._lineBuffer, "hits found"))
365 {
366 clear(context._stringBuffer);
367 for (unsigned i = 2; (i < length(context._lineBuffer) && isdigit(context._lineBuffer[i])); ++i)
368 appendValue(context._stringBuffer, context._lineBuffer[i], Generous());
369
370 uint64_t hits = lexicalCast<uint64_t>(context._stringBuffer);
371
372 if (hits)
373 {
374 resize(r.matches, hits);
375 }
376 else // hits = 0 means no fieldList, restore default
377 {
378 appendValue(context.fields, BlastMatchField<>::Enum::STD);
379
380 context._stringBuffer = std::regex_replace(BlastMatchField<>::columnLabels[0],
381 std::regex(", "),
382 "\x7F");
383 strSplit(context.fieldsAsStrings, context._stringBuffer, EqualsChar<'\x7F'>(), true);
384 }
385
386 ++hitsLinePresent;
387 // break; // comments are finished
388 }
389 else
390 {
391 appendValue(context.otherLines, suffix(context._lineBuffer, 1), Generous());
392 }
393 }
394 else
395 {
396 appendValue(context.otherLines, suffix(context._lineBuffer, 1), Generous());
397 }
398 }
399
400 _goNextLine(context, iter, BlastTabular());
401
402 } while (startsWith(context._lineBuffer, "#") && // still on comments
403 !std::regex_search(std::begin(context._lineBuffer),// start of next record
404 std::end(context._lineBuffer),
405 std::regex("^# T?BLAST")));
406
407 if (context.blastProgram == BlastProgram::UNKNOWN)
408 appendValue(context.conformancyErrors,
409 "Type of BlastProgram could not be determined from comments, you are advised to look "
410 "at context.versionString and context.otherLines.");
411
412 if (queryLinePresent != 1)
413 appendValue(context.conformancyErrors, "No or multiple query lines present.");
414
415 if (dbLinePresent != 1)
416 appendValue(context.conformancyErrors, "No or multiple database lines present.");
417
418 if (context.legacyFormat)
419 {
420 if (fieldsLinePresent != 1)
421 appendValue(context.conformancyErrors, "No or multiple fields lines present.");
422 }
423 else
424 {
425 // is omitted in BLAST_PLUS when there are no hits
426 if ((fieldsLinePresent != 1) && (length(r.matches) > 0))
427 appendValue(context.conformancyErrors, "No or multiple fields lines present.");
428 }
429
430 if (!empty(context.otherLines))
431 appendValue(context.conformancyErrors, "Unexpected lines present, see context.otherLines.");
432 }
433
434 template <typename ... TSpecs,
435 typename TFwdIterator,
436 typename TScore,
437 BlastProgram p,
438 BlastTabularSpec h>
439 inline void
_readCommentLines(BlastRecord<TSpecs...> & r,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)440 _readCommentLines(BlastRecord<TSpecs...> & r,
441 TFwdIterator & iter,
442 BlastIOContext<TScore, p, h> & context,
443 BlastTabular const &)
444 {
445 ++context._numberOfRecords;
446
447 if (context.tabularSpec == BlastTabularSpec::NO_COMMENTS)
448 return;
449
450 clear(r);
451 clear(context.versionString);
452 clear(context.dbName);
453 clear(context.otherLines);
454 clear(context.conformancyErrors);
455 clear(context.fieldsAsStrings);
456
457 if (!context.ignoreFieldsInComments)
458 clear(context.fields);
459
460 _readCommentLinesImpl(r, iter, context, BlastTabular());
461 }
462
463 // ----------------------------------------------------------------------------
464 // Function _readField()
465 // ----------------------------------------------------------------------------
466
467 template <typename TAlignRow0,
468 typename TAlignRow1,
469 typename TPos,
470 typename TQId,
471 typename TSId,
472 typename TScore,
473 typename ... TSpecs,
474 BlastProgram p,
475 BlastTabularSpec h>
476 inline void
_readField(BlastMatch<TAlignRow0,TAlignRow1,TPos,TQId,TSId> & match,BlastRecord<TSpecs...> & record,BlastIOContext<TScore,p,h> & context,typename BlastMatchField<>::Enum const fieldId)477 _readField(BlastMatch<TAlignRow0, TAlignRow1, TPos, TQId, TSId> & match,
478 BlastRecord<TSpecs...> & record,
479 BlastIOContext<TScore, p, h> & context,
480 typename BlastMatchField<>::Enum const fieldId)
481 {
482 switch (fieldId)
483 {
484 case BlastMatchField<>::Enum::STD: // this is cought in the calling function
485 break;
486 case BlastMatchField<>::Enum::Q_SEQ_ID:
487 match.qId = context._stringBuffer;
488 break;
489 // case ENUM::Q_GI: write(s, * ); break;
490 case BlastMatchField<>::Enum::Q_ACC:
491 appendValue(record.qAccs, context._stringBuffer);
492 break;
493 // case ENUM::Q_ACCVER: write(s, * ); break;
494 case BlastMatchField<>::Enum::Q_LEN:
495 match.qLength = lexicalCast<TPos>(context._stringBuffer);
496 record.qLength = match.qLength;
497 break;
498 case BlastMatchField<>::Enum::S_SEQ_ID:
499 match.sId = context._stringBuffer;
500 break;
501 // case ENUM::S_ALL_SEQ_ID: write(s, * ); break;
502 // case ENUM::S_GI: write(s, * ); break;
503 // case ENUM::S_ALL_GI: write(s, * ); break;
504 case BlastMatchField<>::Enum::S_ACC:
505 appendValue(match.sAccs, context._stringBuffer);
506 break;
507 // case ENUM::S_ACCVER: write(s, * ); break;
508 case BlastMatchField<>::Enum::S_ALLACC:
509 strSplit(match.sAccs, context._stringBuffer, EqualsChar<';'>());
510 break;
511 case BlastMatchField<>::Enum::S_LEN:
512 match.sLength = lexicalCast<TPos>(context._stringBuffer);
513 break;
514 case BlastMatchField<>::Enum::Q_START:
515 match.qStart = lexicalCast<TPos>(context._stringBuffer);
516 break;
517 case BlastMatchField<>::Enum::Q_END:
518 match.qEnd = lexicalCast<TPos>(context._stringBuffer);
519 break;
520 case BlastMatchField<>::Enum::S_START:
521 match.sStart = lexicalCast<TPos>(context._stringBuffer);
522 break;
523 case BlastMatchField<>::Enum::S_END:
524 match.sEnd = lexicalCast<TPos>(context._stringBuffer);
525 break;
526 // case ENUM::Q_SEQ: write(s, * ); break;
527 // case ENUM::S_SEQ: write(s, * ); break;
528 case BlastMatchField<>::Enum::E_VALUE:
529 match.eValue = lexicalCast<double>(context._stringBuffer);
530 break;
531 case BlastMatchField<>::Enum::BIT_SCORE:
532 match.bitScore = lexicalCast<double>(context._stringBuffer);
533 break;
534 case BlastMatchField<>::Enum::SCORE:
535 match.alignStats.alignmentScore = lexicalCast<TPos>(context._stringBuffer);
536 break;
537 case BlastMatchField<>::Enum::LENGTH:
538 match.alignStats.alignmentLength = lexicalCast<TPos>(context._stringBuffer);
539 break;
540 case BlastMatchField<>::Enum::P_IDENT:
541 match.alignStats.alignmentIdentity = lexicalCast<double>(context._stringBuffer);
542 break;
543 case BlastMatchField<>::Enum::N_IDENT:
544 match.alignStats.numMatches = lexicalCast<TPos>(context._stringBuffer);
545 break;
546 case BlastMatchField<>::Enum::MISMATCH:
547 match.alignStats.numMismatches = lexicalCast<TPos>(context._stringBuffer);
548 break;
549 case BlastMatchField<>::Enum::POSITIVE:
550 match.alignStats.numPositiveScores = lexicalCast<TPos>(context._stringBuffer);
551 break;
552 case BlastMatchField<>::Enum::GAP_OPEN:
553 match.alignStats.numGapOpens = lexicalCast<TPos>(context._stringBuffer);
554 break;
555 case BlastMatchField<>::Enum::GAPS:
556 match.alignStats.numGaps = lexicalCast<TPos>(context._stringBuffer);
557 break;
558 case BlastMatchField<>::Enum::P_POS:
559 match.alignStats.alignmentSimilarity = lexicalCast<double>(context._stringBuffer);
560 break;
561 case BlastMatchField<>::Enum::FRAMES:
562 {
563 clear(context._setBuffer2);
564 strSplit(context._setBuffer2, context._stringBuffer, EqualsChar<'/'>());
565 if (length(context._setBuffer2) != 2)
566 SEQAN_THROW(ParseError("Could not process frame string."));
567 match.qFrameShift = lexicalCast<int8_t>(context._setBuffer2[0]);
568 match.sFrameShift = lexicalCast<int8_t>(context._setBuffer2[1]);
569 } break;
570 case BlastMatchField<>::Enum::Q_FRAME:
571 match.qFrameShift = lexicalCast<int8_t>(context._stringBuffer);
572 break;
573 case BlastMatchField<>::Enum::S_FRAME:
574 match.sFrameShift = lexicalCast<int8_t>(context._stringBuffer);
575 break;
576 // case ENUM::BTOP: write( * ); break;
577 case BlastMatchField<>::Enum::S_TAX_IDS:
578 {
579 StringSet<CharString> temp;
580 strSplit(temp, context._stringBuffer, EqualsChar<';'>());
581 for (auto const & s : temp)
582 appendValue(match.sTaxIds, lexicalCast<uint64_t>(s));
583 } break;
584 // case ENUM::S_SCI_NAMES: write( * ); break;
585 // case ENUM::S_COM_NAMES: write( * ); break;
586 // case ENUM::S_BLAST_NAMES: write( * ); break;
587 // case ENUM::S_S_KINGDOMS: write( * ); break;
588 // case ENUM::S_TITLE: write( * ); break;
589 // case ENUM::S_ALL_TITLES: write( * ); break;
590 // case ENUM::S_STRAND: write( * ); break;
591 // case ENUM::Q_COV_S: write( * ); break;
592 // case ENUM::Q_COV_HSP:
593 case BlastMatchField<>::Enum::LCA_ID:
594 record.lcaId = context._stringBuffer;
595 break;
596 case BlastMatchField<>::Enum::LCA_TAX_ID:
597 record.lcaTaxId = lexicalCast<uint32_t>(context._stringBuffer);
598 break;
599 default:
600 SEQAN_THROW(ParseError("The requested column type is not yet "
601 "implemented."));
602 };
603 }
604
605 // ----------------------------------------------------------------------------
606 // Function _readMatch()
607 // ----------------------------------------------------------------------------
608
609 template <typename TAlignRow0,
610 typename TAlignRow1,
611 typename TPos,
612 typename TQId,
613 typename TSId,
614 typename TFwdIterator,
615 typename TScore,
616 typename ... TSpecs,
617 BlastProgram p,
618 BlastTabularSpec h>
619 inline void
_readMatch(BlastMatch<TAlignRow0,TAlignRow1,TPos,TQId,TSId> & match,BlastRecord<TSpecs...> & record,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)620 _readMatch(BlastMatch<TAlignRow0, TAlignRow1, TPos, TQId, TSId> & match,
621 BlastRecord<TSpecs...> & record,
622 TFwdIterator & iter,
623 BlastIOContext<TScore, p, h> & context,
624 BlastTabular const &)
625 {
626 if (context.legacyFormat)
627 {
628 #if SEQAN_ENABLE_DEBUG
629 if ((length(context.fields) != 1) || (context.fields[0] != BlastMatchField<>::Enum::STD))
630 std::cerr << "Warning: custom fields set, but will be reset, because legacyFormat is also set.\n";
631 #endif
632 // set defaults
633 clear(context.fields);
634 appendValue(context.fields, BlastMatchField<>::Enum::STD);
635 }
636
637 // comments should have been read or skipped
638 if (SEQAN_UNLIKELY(!_onMatch(context, BlastTabular())))
639 SEQAN_THROW(ParseError("Not on beginning of Match (you should have skipped comments)."));
640
641 match._maxInitialize(); // mark all members as "not set"
642
643 clear(context._setBuffer1);
644
645 auto & fields = context._setBuffer1;
646 strSplit(fields, context._lineBuffer, IsTab());
647
648 // line in buffer was processed so new line is read from iter into buffer
649 _goNextLine(context, iter, BlastTabular());
650
651 unsigned n = 0u;
652 for (BlastMatchField<>::Enum const f : context.fields)
653 {
654 // this field represents multiple fields
655 if (f == BlastMatchField<>::Enum::STD)
656 {
657 for (typename BlastMatchField<>::Enum const f2 : BlastMatchField<>::defaults)
658 {
659 if (SEQAN_UNLIKELY(n >= length(fields)))
660 SEQAN_THROW(ParseError("More columns expected than were present in file."));
661
662 context._stringBuffer = static_cast<decltype(context._stringBuffer)>(fields[n++]);
663 _readField(match, record, context, f2);
664 }
665 } else
666 {
667 if (SEQAN_UNLIKELY(n >= length(fields)))
668 SEQAN_THROW(ParseError("More columns expected than were present in file."));
669
670 context._stringBuffer = static_cast<decltype(context._stringBuffer)>(fields[n++]);
671 _readField(match, record, context, f);
672 }
673 }
674
675 // retransform the percentages to real numbers
676 if (!_memberIsSet(match.alignStats.numMatches) &&
677 _memberIsSet(match.alignStats.alignmentLength) &&
678 _memberIsSet(match.alignStats.alignmentIdentity))
679 match.alignStats.numMatches = std::lround(double(match.alignStats.alignmentLength) *
680 match.alignStats.alignmentIdentity / 100.0);
681 if (!_memberIsSet(match.alignStats.numPositiveScores) &&
682 _memberIsSet(match.alignStats.alignmentLength) &&
683 _memberIsSet(match.alignStats.alignmentSimilarity))
684 match.alignStats.numPositiveScores = std::lround(double(match.alignStats.alignmentLength) *
685 match.alignStats.alignmentSimilarity / 100.0);
686
687 //TODO also transform numbers to percentages; possibly do other calculations
688
689 // and compute gaps from other values
690 // since gaps are included in the mismatch count in BLAST_LEGACY they cannot be computed here
691 if (!context.legacyFormat &&
692 !_memberIsSet(match.alignStats.numGaps) &&
693 _memberIsSet(match.alignStats.alignmentLength) &&
694 _memberIsSet(match.alignStats.numMatches) &&
695 _memberIsSet(match.alignStats.numMismatches))
696 match.alignStats.numGaps = match.alignStats.alignmentLength -
697 match.alignStats.numMatches -
698 match.alignStats.numMismatches;
699 }
700
701 // ----------------------------------------------------------------------------
702 // Function readRecord()
703 // ----------------------------------------------------------------------------
704
705 template <typename TFwdIterator,
706 typename ... TSpecs,
707 typename TScore,
708 BlastProgram p,
709 BlastTabularSpec h>
710 inline void
_readRecordWithCommentLines(BlastRecord<TSpecs...> & blastRecord,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)711 _readRecordWithCommentLines(BlastRecord<TSpecs...> & blastRecord,
712 TFwdIterator & iter,
713 BlastIOContext<TScore, p, h> & context,
714 BlastTabular const &)
715 {
716 _readCommentLines(blastRecord, iter, context, BlastTabular());
717
718 if (!context.legacyFormat) // this is detected from the comments
719 {
720 // .matches already resized for us
721 for (auto & m : blastRecord.matches)
722 {
723 if (!_onMatch(context, BlastTabular()))
724 {
725 appendValue(context.conformancyErrors,
726 "Less matches than promised by the comments");
727 break;
728 }
729
730 _readMatch(m, blastRecord, iter, context, BlastTabular());
731 }
732
733 if ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
734 appendValue(context.conformancyErrors,
735 "More matches than promised by the comments");
736
737 while ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
738 {
739 blastRecord.matches.emplace_back();
740 _readMatch(back(blastRecord.matches), blastRecord, iter, context, BlastTabular());
741 }
742 } else
743 {
744 while ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
745 {
746 blastRecord.matches.emplace_back();
747 _readMatch(back(blastRecord.matches), blastRecord, iter, context, BlastTabular());
748 }
749 }
750 }
751
752 template <typename ... TSpecs,
753 typename TFwdIterator,
754 typename TScore,
755 BlastProgram p,
756 BlastTabularSpec h>
757 inline void
_readRecordWithoutCommentLines(BlastRecord<TSpecs...> & blastRecord,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)758 _readRecordWithoutCommentLines(BlastRecord<TSpecs...> & blastRecord,
759 TFwdIterator & iter,
760 BlastIOContext<TScore, p, h> & context,
761 BlastTabular const &)
762 {
763 ++context._numberOfRecords;
764
765 clear(blastRecord);
766
767 auto it = begin(context._lineBuffer, Rooted()); // move into line below when seqan supports && properly
768 readUntil(blastRecord.qId, it, IsTab());
769
770 auto curIdPlusTab = blastRecord.qId;
771 appendValue(curIdPlusTab, '\t');
772
773 if ((context.fields[0] != BlastMatchField<>::Enum::STD) &&
774 (context.fields[0] != BlastMatchField<>::Enum::Q_SEQ_ID))
775 {
776 SEQAN_FAIL("ERROR: readRecord interface on comment-less format with custom "
777 "fields not supported, unless first custom field is "
778 "Q_SEQ_ID. Use the lowlevel readMatch interface instead.");
779 }
780
781 while ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
782 {
783 blastRecord.matches.emplace_back();
784 // read remainder of line
785 _readMatch(back(blastRecord.matches), blastRecord, iter, context, BlastTabular());
786
787 if (!startsWith(context._lineBuffer, curIdPlusTab)) // next record reached
788 break;
789 }
790
791 if (empty(blastRecord.matches))
792 SEQAN_THROW(ParseError("No Matches could be read."));
793 }
794
795 /*!
796 * @fn BlastTabularFileIn#readRecord
797 * @brief Read a record from a file in BlastTabular format.
798 * @headerfile seqan/blast.h
799 * @signature void readRecord(blastRecord, blastTabularIn);
800 *
801 * @param[out] blastRecord A @link BlastRecord @endlink to hold all information related to one query sequence.
802 * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
803 *
804 * @section Remarks
805 *
806 * This function will read an entire record from a blast tabular file, i.e. it will read the comment lines
807 * (if the format is @link BlastTabularSpec::COMMENTS @endlink) and 0-n @link BlastMatch @endlinkes belonging
808 * to one query.
809 *
810 * Please note that if there are no comment lines in the file the boundary
811 * between records is inferred from the indentity of the first field, i.e.
812 * non-standard field configurations must also have Q_SEQ_ID as their first @link BlastMatchField @endlink.
813 *
814 * @subsection Comment lines
815 *
816 * The @link BlastRecord::qId @endlink member of the record is read from the comment lines and the
817 * @link BlastRecord::matches @endlink are resized to the expected number of matches succeeding the comments.
818 *
819 * This function also sets many properties of blastTabularIn's @link BlastIOContext @endlink, including these members:
820 * <li> @link BlastIOContext::versionString @endlink: version string of the program.</li>
821 * <li> @link BlastIOContext::dbName @endlink: name of the database.</li>
822 * <li> @link BlastIOContext::fields @endlink: descriptors for the columns.</li>
823 * <li> @link BlastIOContext::fieldsAsStrings @endlink: labels of the columns
824 * as they appear in the file.</li>
825 * <li> @link BlastIOContext::conformancyErrors @endlink: if this StringSet is not empty, then there are issues in the
826 * comments.</li>
827 * <li> @link BlastIOContext::otherLines @endlink: any lines that cannot be interpreted; these always also imply
828 * conformancyErrors.</li>
829 * <li>@link BlastIOContext::legacyFormat @endlink: whether the record (and likely the entire file) is in
830 * legacyFormat.</li>
831 *
832 * It also sets the blast program run-time parameter of the context depending on the information found in the comments. If
833 * the compile time parameter was set on the context and they are different this will result in a critical error.
834 *
835 * Please note that for @link BlastIOContext::legacyFormat @endlink the @link BlastIOContext::fields @endlink member
836 * is always ignored, however @link BlastIOContext::fieldsAsStrings @endlink is still read from the comments, in case
837 * you want to process it.
838 *
839 * In case you do not wish the @link BlastIOContext::fields @endlink to be read from the comments, you can set
840 * context.@link BlastIOContext::ignoreFieldsInComments @endlink to true. This will be prevent it from being read and will
841 * allow you to specify it manually which might be relevant for reading the match lines.
842 *
843 * If the format is @link BlastTabularSpec::NO_COMMENTS @endlink none of the above happens and
844 * @link BlastRecord::qId @endlink is derived from the first match.
845 *
846 * @subsection Matches
847 *
848 * A match line contains 1 - n columns or fields, 12 by default.
849 * The @link BlastIOContext::fields @endlink member of the context is considered when reading these fields. It is
850 * usually extracted from the comment lines but can also be set by yourself if there are no comments or if you want
851 * to overwrite the comments' information (see above).
852 * You may specify less
853 * fields than are actually present, in this case the additional fields will be
854 * discarded. The parameter is ignored if @link BlastIOContext::legacyFormat @endlink is set.
855 *
856 * To differentiate between members of a @link BlastMatch @endlink that were read from the file and those that have
857 * not been set (e.g. both could be 0), the latter are initialized to their respective max-values.
858 *
859 * Please note that the only transformations made to the data are the following:
860 *
861 * <li> computation of the number of identities (from the percentage) [default]</li>
862 * <li> computation of the number of positives (from the percentage) [if given]</li>
863 * <li> number of gaps computed from other values [default]</li>
864 *
865 * In contrast to @link BlastTabularFileOut#writeRecord @endlink no other transformations
866 * are made, e.g. the positions are still one-indexed and
867 * flipped for reverse strand matches. This is due to the required fields for
868 * retransformation (sequence lengths, frames) not being available in the
869 * default columns.
870 *
871 * @throw IOError On low-level I/O errors.
872 * @throw ParseError On high-level file format errors.
873 */
874
875 template <typename ... TSpecs,
876 typename TFwdIterator,
877 typename TScore,
878 BlastProgram p,
879 BlastTabularSpec h>
880 inline void
readRecord(BlastRecord<TSpecs...> & blastRecord,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)881 readRecord(BlastRecord<TSpecs...> & blastRecord,
882 TFwdIterator & iter,
883 BlastIOContext<TScore, p, h> & context,
884 BlastTabular const &)
885 {
886 if (context.tabularSpec == BlastTabularSpec::NO_COMMENTS)
887 _readRecordWithoutCommentLines(blastRecord, iter, context, BlastTabular());
888 else
889 _readRecordWithCommentLines(blastRecord, iter, context, BlastTabular());
890 }
891
892 template <typename ... TSpecs,
893 typename TContext>
894 inline void
readRecord(BlastRecord<TSpecs...> & blastRecord,BlastTabularFileIn<TContext> & formattedFile)895 readRecord(BlastRecord<TSpecs...> & blastRecord,
896 BlastTabularFileIn<TContext> & formattedFile)
897 {
898 readRecord(blastRecord, formattedFile.iter, context(formattedFile), BlastTabular());
899 }
900
901 // ----------------------------------------------------------------------------
902 // Function readHeader()
903 // ----------------------------------------------------------------------------
904
905 /*!
906 * @fn BlastTabularFileIn#readHeader
907 * @headerfile seqan/blast.h
908 * @brief Read the header (top-most section) of a BlastTabular file.
909 * @signature void readHeader(blastTabularIn);
910 *
911 * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
912 *
913 * @throw IOError On low-level I/O errors.
914 * @throw ParseError On high-level file format errors.
915 */
916
917 template <typename TFwdIterator,
918 typename TScore,
919 BlastProgram p,
920 BlastTabularSpec h>
921 inline void
readHeader(BlastIOContext<TScore,p,h> & context,TFwdIterator & iter,BlastTabular const &)922 readHeader(BlastIOContext<TScore, p, h> & context,
923 TFwdIterator & iter,
924 BlastTabular const & /*tag*/)
925 {
926 readLine(context._lineBuffer, iter); // fill the line-buffer the first time
927
928 if (context.tabularSpec == BlastTabularSpec::UNKNOWN)
929 {
930 if (_onMatch(context, BlastTabular()))
931 context.tabularSpec = BlastTabularSpec::NO_COMMENTS;
932 else
933 context.tabularSpec = BlastTabularSpec::COMMENTS;
934 }
935 }
936
937 template <typename TContext>
938 inline void
readHeader(BlastTabularFileIn<TContext> & formattedFile)939 readHeader(BlastTabularFileIn<TContext> & formattedFile)
940 {
941 readHeader(context(formattedFile), formattedFile.iter, BlastTabular());
942 }
943
944 // ----------------------------------------------------------------------------
945 // Function readFooter()
946 // ----------------------------------------------------------------------------
947
948 /*!
949 * @fn BlastTabularFileIn#readFooter
950 * @headerfile seqan/blast.h
951 * @brief Read the footer (bottom-most section) of a BlastTabular file.
952 * @signature void readFooter(blastTabularIn);
953 *
954 * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
955 *
956 * @throw IOError On low-level I/O errors.
957 * @throw ParseError On high-level file format errors.
958 */
959
960 template <typename TFwdIterator,
961 typename TScore,
962 BlastProgram p,
963 BlastTabularSpec h>
964 inline void
readFooter(BlastIOContext<TScore,p,h> & context,TFwdIterator & iter,BlastTabular const &)965 readFooter(BlastIOContext<TScore, p, h> & context,
966 TFwdIterator & iter,
967 BlastTabular const & /*tag*/)
968 {
969 clear(context.otherLines);
970 clear(context.conformancyErrors);
971 clear(context.fieldsAsStrings);
972 clear(context.fields);
973
974 if ((context.tabularSpec == BlastTabularSpec::COMMENTS) && !context.legacyFormat)
975 {
976 if (SEQAN_UNLIKELY(!startsWith(context._lineBuffer, "# BLAST processed")))
977 {
978 std::cout << "\"" << context._lineBuffer << "\"" << std::endl;
979 SEQAN_FAIL("ERROR: Tried to read footer, but was not on footer.");
980 }
981 clear(context._stringBuffer);
982 auto it = seqan::begin(context._lineBuffer);
983 it += 18; // skip "BLAST processed "
984 readUntil(context._stringBuffer, it, IsBlank());
985
986 uint64_t numRecords = lexicalCast<uint64_t>(context._stringBuffer);
987
988 clear(context.conformancyErrors);
989 if (context._numberOfRecords < numRecords)
990 appendValue(context.conformancyErrors, "The file claims to contain more records than you read.");
991 else if (context._numberOfRecords > numRecords)
992 appendValue(context.conformancyErrors, "The file claims to contain less records than you read.");
993 }
994
995 if (!atEnd(iter))
996 appendValue(context.conformancyErrors, "Expected to be at end of file.");
997
998 _goNextLine(context, iter, BlastTabular());
999 }
1000
1001 template <typename TContext>
1002 inline void
readFooter(BlastTabularFileIn<TContext> & formattedFile)1003 readFooter(BlastTabularFileIn<TContext> & formattedFile)
1004 {
1005 readFooter(context(formattedFile), formattedFile.iter, BlastTabular());
1006 }
1007
1008 } // namespace seqan
1009
1010 #endif // SEQAN_BLAST_READ_BLAST_TABULAR_H_
1011