1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
33 // ==========================================================================
34 // This file contains routines to read BLAST tab-seperated output
35 // ==========================================================================
36 
37 #ifndef SEQAN_BLAST_BLAST_TABULAR_READ_H_
38 #define SEQAN_BLAST_BLAST_TABULAR_READ_H_
39 
40 /* IMPLEMENTATION NOTES
41 
42 BLAST TABULAR example:
43 
44 The format of a blast tabular output file is less simple than it looks, here's
45 the general form
46 
47 COMMENTLINES
48  MATCH
49  MATCH
50  MATCH
51  MATCH
52 COMMENTLINES
53  MATCH
54 COMMENTLINES
55 COMMENTLINES
56 ...
57 
58 => COMMENTLINES for each sequence, 0-n Matchs for each sequence
59 => Each record is one-line, each COMMENTLINES is multiline
60 
61 COMMENTLINES usually consists of:
62 
63 # Program Tag [e.g BLASTX 2.2.27+ ]
64 # Query Id [ID of the query *sequence*]
65 # Database Id [note that this is not the name of the sequence in the db, but of
66   the database itself, e.g. "nr" -> usually the same for each file]
67 # Fields: [Headers of columns]
68 # n "hits found"
69 
70 The first three lines are always written.
71 The Fields line is always writen by NCBI Blast, but only when hits > 0 by NCBI Blast+.
72 The "number of hits"-line is always printed by NCBI Blast+, and never by NCBI Blast.
73 
74 Possibly other lines can be written as comments.
75 
76 Because 0 matches are allowed, multiple COMMENTLINES can succeed each other, the
77 criterium for separation employed by this implementation is that an NCBI Blast
78 COMMENTLINES always ends after the "Fields" line and NCBI Blast+ COMMENTLINES end after
79 the "number of hits"-line.
80 */
81 
82 namespace seqan
83 {
84 
85 // ============================================================================
86 // Forwards
87 // ============================================================================
88 
89 // ============================================================================
90 // Tags, Classes, Enums
91 // ============================================================================
92 
93 // ----------------------------------------------------------------------------
94 // Type BlastFileIn
95 // ----------------------------------------------------------------------------
96 
97 /*!
98  * @class BlastTabularFileIn
99  * @signature template <typename TBlastIOContext>
100  * using BlastTabularFileIn = FormattedFile<BlastTabular, Input, TBlastIOContext>;
101  * @extends FormattedFileIn
102  * @headerfile <seqan/blast.h>
103  * @brief FormattedFileIn abstraction for @link BlastTabular @endlink
104  *
105  * This is a @link FormattedFile @endlink specialization for reading @link BlastTabular @endlink formats. For details
106  * on how to influence the reading of files and how to differentiate between the tabular format without comment lines
107  * and the one with comment lines, see @link BlastIOContext @endlink.
108  * Please note that you have specify the type of the context as a template parameter to BlastTabularFileIn.
109  *
110  * @section Overview
111  *
112  * <ul>
113  * <li> open @link BlastTabularFileIn @endlink</li>
114  * <li> @link BlastTabularFileIn#readHeader @endlink </li>
115  * <li> while @link BlastTabularFileIn#onRecord @endlink </li>
116  * <ul><li> @link BlastTabularFileIn#readRecord @endlink</li></ul>
117  * <li> @link BlastTabularFileIn#readFooter @endlink </li>
118  * </ul>
119  *
120  * For a detailed example have a look at the
121  * <a href="http://seqan.readthedocs.io/en/develop/Tutorial/InputOutput/BlastIO.html">Blast IO tutorial</a>.
122  *
123  * @see BlastRecord
124  */
125 
126 template <typename TBlastIOContext = BlastIOContext<>>
127 using BlastTabularFileIn = FormattedFile<BlastTabular, Input, TBlastIOContext>;
128 
129 // ============================================================================
130 // Metafunctions
131 // ============================================================================
132 
133 // ============================================================================
134 // Functions
135 // ============================================================================
136 
137 // ----------------------------------------------------------------------------
138 // Function atEnd()
139 // ----------------------------------------------------------------------------
140 
141 template <typename TScore,
142           typename TFwdIterator,
143           BlastProgram p,
144           BlastTabularSpec h>
145 inline bool
atEnd(BlastIOContext<TScore,p,h> const & context,TFwdIterator const & iter,BlastTabular const &)146 atEnd(BlastIOContext<TScore, p, h> const & context,
147       TFwdIterator const & iter,
148       BlastTabular const &)
149 {
150     return (atEnd(iter) && empty(context._lineBuffer));
151 }
152 
153 template <typename TContext>
154 inline bool
atEnd(BlastTabularFileIn<TContext> & formattedFile)155 atEnd(BlastTabularFileIn<TContext> & formattedFile)
156 {
157     return atEnd(context(formattedFile), formattedFile.iter, BlastTabular());
158 }
159 
160 // ----------------------------------------------------------------------------
161 // Function guessFormat()
162 // ----------------------------------------------------------------------------
163 
164 template <typename TSpec>
guessFormat(FormattedFile<BlastTabular,Input,TSpec> & file)165 inline bool guessFormat(FormattedFile<BlastTabular, Input, TSpec> & file)
166 {
167     if (value(file.iter) == '#')
168         context(file).tabularSpec = BlastTabularSpec::COMMENTS;
169     else
170         context(file).tabularSpec = BlastTabularSpec::NO_COMMENTS;
171     return true;
172 }
173 
174 // ----------------------------------------------------------------------------
175 // Function _onMatch()
176 // ----------------------------------------------------------------------------
177 
178 template <typename TScore,
179           BlastProgram p,
180           BlastTabularSpec h>
181 inline bool
_onMatch(BlastIOContext<TScore,p,h> & context,BlastTabular const &)182 _onMatch(BlastIOContext<TScore, p, h> & context,
183         BlastTabular const &)
184 {
185     return (!startsWith(context._lineBuffer, "#") && !empty(context._lineBuffer));
186 }
187 
188 // ----------------------------------------------------------------------------
189 // Function onRecord()
190 // ----------------------------------------------------------------------------
191 
192 /*!
193  * @fn BlastTabularFileIn#onRecord
194  * @brief Returns whether the currently buffered line looks like the start of a record.
195  * @signature bool onRecord(blastTabularIn);
196  * @headerfile seqan/blast.h
197  *
198  * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
199  *
200  * @throw IOError On low-level I/O errors.
201  * @throw ParseError On high-level file format errors.
202  *
203  * @return bool true or false
204  */
205 
206 template <typename TScore,
207           BlastProgram p,
208           BlastTabularSpec h>
209 inline bool
onRecord(BlastIOContext<TScore,p,h> & context,BlastTabular const &)210 onRecord(BlastIOContext<TScore, p, h> & context,
211          BlastTabular const &)
212 {
213     if (context.tabularSpec == BlastTabularSpec::NO_COMMENTS)
214         return _onMatch(context, BlastTabular());
215 
216     //      comment lines                                  Footer
217     return (startsWith(context._lineBuffer, "#") && (!startsWith(context._lineBuffer, "# BLAST processed ")));
218 }
219 
220 template <typename TContext>
221 inline bool
onRecord(BlastTabularFileIn<TContext> & formattedFile)222 onRecord(BlastTabularFileIn<TContext> & formattedFile)
223 {
224     return onRecord(context(formattedFile), BlastTabular());
225 }
226 
227 // ----------------------------------------------------------------------------
228 // Function _goNextLine()
229 // ----------------------------------------------------------------------------
230 
231 template <typename TScore,
232           typename TFwdIterator,
233           BlastProgram p,
234           BlastTabularSpec h>
235 inline void
_goNextLine(BlastIOContext<TScore,p,h> & context,TFwdIterator & iter,BlastTabular const &)236 _goNextLine(BlastIOContext<TScore, p, h> & context,
237            TFwdIterator & iter,
238            BlastTabular const &)
239 {
240     clear(context._lineBuffer);
241     if (!atEnd(iter))
242         readLine(context._lineBuffer, iter);
243 }
244 
245 // ----------------------------------------------------------------------------
246 // Function _readCommentLines()
247 // ----------------------------------------------------------------------------
248 
249 template <typename ... TSpecs,
250           typename TFwdIterator,
251           typename TScore,
252           BlastProgram p,
253           BlastTabularSpec h>
254 inline void
_readCommentLinesImpl(BlastRecord<TSpecs...> & r,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)255 _readCommentLinesImpl(BlastRecord<TSpecs...> & r,
256                       TFwdIterator & iter,
257                       BlastIOContext<TScore, p, h> & context,
258                       BlastTabular const &)
259 {
260     // this is a match instead of comment lines
261     if (_onMatch(context, BlastTabular()))
262         SEQAN_THROW(ParseError("Commen lines expected, but not found."));
263     else
264         context.tabularSpec = BlastTabularSpec::COMMENTS;
265 
266     int queryLinePresent = 0;
267     int dbLinePresent = 0;
268     int fieldsLinePresent = 0;
269     int hitsLinePresent = 0;
270 
271     do
272     {
273         if (std::regex_search(std::begin(context._lineBuffer),
274                               std::end(context._lineBuffer),
275                               std::regex("^# T?BLAST")))
276         {
277             // last line of file
278             if (SEQAN_UNLIKELY(startsWith(context._lineBuffer, "# BLAST processed ") && !context.legacyFormat))
279             {
280                 SEQAN_FAIL("ERROR: You called readRecord() when you should have called readFooter()."
281                            "Always check onRecord() before calling readRecord().");
282             }
283             else // first line of the comments
284             {
285                 assign(context.versionString, suffix(context._lineBuffer, 2));
286                 context.blastProgram = _programStringToTag(prefix(context.versionString,
287                                                                   std::find(begin(context.versionString, Standard()),
288                                                                             end(context.versionString, Standard()),
289                                                                             ' ')
290                                                                   - begin(context.versionString, Standard())));
291 
292                 context.legacyFormat = !std::regex_search(std::begin(context.versionString),
293                                                           std::end(context.versionString),
294                                                           std::regex("\\d\\.\\d\\.\\d{1,2}\\+"));
295             }
296         }
297         else if (startsWith(context._lineBuffer, "# Query:"))
298         {
299             ++queryLinePresent;
300             assign(r.qId, suffix(context._lineBuffer, 9));
301 
302         }
303         else if (startsWith(context._lineBuffer, "# Database:"))
304         {
305             ++dbLinePresent;
306             assign(context.dbName, suffix(context._lineBuffer, 12));
307         }
308         else if (startsWith(context._lineBuffer, "# Fields:"))
309         {
310             ++fieldsLinePresent;
311             clear(context._stringBuffer);
312             // make sure target is big enough
313             resize(context._stringBuffer, length(context._lineBuffer) - 10);
314             // use escape character as placeholder for replacement since strSplit only handles single characters
315             std::regex_replace(std::begin(context._stringBuffer),
316                                std::begin(context._lineBuffer) + 10, // skip "# Fields:"
317                                std::end(context._lineBuffer),
318                                std::regex(", "),
319                                "\x7F");
320             // shrink back down to actual size (replacing two letters with one makes string shorter!)
321             resize(context._stringBuffer, length(context._stringBuffer.c_str()));
322             strSplit(context.fieldsAsStrings, context._stringBuffer, EqualsChar<'\x7F'>(), true);
323 
324             if (context.legacyFormat)
325             {
326                 // assume defaults for LEGACY
327                 if (!context.ignoreFieldsInComments)
328                     appendValue(context.fields, BlastMatchField<>::Enum::STD);
329             }
330             else if (!context.ignoreFieldsInComments)
331             {
332                 bool defaults = true;
333                 resize(context.fields, length(context.fieldsAsStrings));
334                 for (uint8_t j = 0; j < length(context.fieldsAsStrings); ++j)
335                 {
336                     for (uint8_t i = 0; i < length(BlastMatchField<>::columnLabels); ++i)
337                     {
338                         if (context.fieldsAsStrings[j] ==
339                             BlastMatchField<>::columnLabels[i])
340                         {
341                             context.fields[j] =
342                                 static_cast<BlastMatchField<>::Enum>(i);
343 
344                             if ((j >= length(BlastMatchField<>::defaults)) &&
345                                 (static_cast<BlastMatchField<>::Enum>(i) !=
346                                 BlastMatchField<>::defaults[j]))
347                                 defaults = false;
348                             break;
349                         }
350                     }
351                 }
352                 if (defaults) // replace multiple fields with the default meta field
353                 {
354                     clear(context.fields);
355                     appendValue(context.fields, BlastMatchField<>::Enum::STD);
356                 }
357             }
358         }
359         else
360         {
361             if (!context.legacyFormat)
362             {
363                 // is hits counter?
364                 if (endsWith(context._lineBuffer, "hits found"))
365                 {
366                     clear(context._stringBuffer);
367                     for (unsigned i = 2; (i < length(context._lineBuffer) && isdigit(context._lineBuffer[i])); ++i)
368                         appendValue(context._stringBuffer, context._lineBuffer[i], Generous());
369 
370                     uint64_t hits = lexicalCast<uint64_t>(context._stringBuffer);
371 
372                     if (hits)
373                     {
374                         resize(r.matches, hits);
375                     }
376                     else  // hits = 0 means no fieldList, restore default
377                     {
378                         appendValue(context.fields, BlastMatchField<>::Enum::STD);
379 
380                         context._stringBuffer = std::regex_replace(BlastMatchField<>::columnLabels[0],
381                                                                    std::regex(", "),
382                                                                    "\x7F");
383                         strSplit(context.fieldsAsStrings, context._stringBuffer, EqualsChar<'\x7F'>(), true);
384                     }
385 
386                     ++hitsLinePresent;
387 //                     break; // comments are finished
388                 }
389                 else
390                 {
391                     appendValue(context.otherLines, suffix(context._lineBuffer, 1), Generous());
392                 }
393             }
394             else
395             {
396                 appendValue(context.otherLines, suffix(context._lineBuffer, 1), Generous());
397             }
398         }
399 
400         _goNextLine(context, iter, BlastTabular());
401 
402     } while (startsWith(context._lineBuffer, "#") &&                   // still on comments
403              !std::regex_search(std::begin(context._lineBuffer),// start of next record
404                                 std::end(context._lineBuffer),
405                                 std::regex("^# T?BLAST")));
406 
407     if (context.blastProgram == BlastProgram::UNKNOWN)
408         appendValue(context.conformancyErrors,
409                     "Type of BlastProgram could not be determined from comments, you are advised to look "
410                     "at context.versionString and context.otherLines.");
411 
412     if (queryLinePresent != 1)
413         appendValue(context.conformancyErrors, "No or multiple query lines present.");
414 
415     if (dbLinePresent != 1)
416         appendValue(context.conformancyErrors, "No or multiple database lines present.");
417 
418     if (context.legacyFormat)
419     {
420         if (fieldsLinePresent != 1)
421             appendValue(context.conformancyErrors, "No or multiple fields lines present.");
422     }
423     else
424     {
425         // is omitted in BLAST_PLUS when there are no hits
426         if ((fieldsLinePresent != 1) && (length(r.matches) > 0))
427             appendValue(context.conformancyErrors, "No or multiple fields lines present.");
428     }
429 
430     if (!empty(context.otherLines))
431         appendValue(context.conformancyErrors, "Unexpected lines present, see context.otherLines.");
432 }
433 
434 template <typename ... TSpecs,
435           typename TFwdIterator,
436           typename TScore,
437           BlastProgram p,
438           BlastTabularSpec h>
439 inline void
_readCommentLines(BlastRecord<TSpecs...> & r,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)440 _readCommentLines(BlastRecord<TSpecs...> & r,
441                   TFwdIterator & iter,
442                   BlastIOContext<TScore, p, h> & context,
443                   BlastTabular const &)
444 {
445     ++context._numberOfRecords;
446 
447     if (context.tabularSpec == BlastTabularSpec::NO_COMMENTS)
448         return;
449 
450     clear(r);
451     clear(context.versionString);
452     clear(context.dbName);
453     clear(context.otherLines);
454     clear(context.conformancyErrors);
455     clear(context.fieldsAsStrings);
456 
457     if (!context.ignoreFieldsInComments)
458         clear(context.fields);
459 
460     _readCommentLinesImpl(r, iter, context, BlastTabular());
461 }
462 
463 // ----------------------------------------------------------------------------
464 // Function _readField()
465 // ----------------------------------------------------------------------------
466 
467 template <typename TAlignRow0,
468           typename TAlignRow1,
469           typename TPos,
470           typename TQId,
471           typename TSId,
472           typename TScore,
473           typename ... TSpecs,
474           BlastProgram p,
475           BlastTabularSpec h>
476 inline void
_readField(BlastMatch<TAlignRow0,TAlignRow1,TPos,TQId,TSId> & match,BlastRecord<TSpecs...> & record,BlastIOContext<TScore,p,h> & context,typename BlastMatchField<>::Enum const fieldId)477 _readField(BlastMatch<TAlignRow0, TAlignRow1, TPos, TQId, TSId> & match,
478            BlastRecord<TSpecs...> & record,
479            BlastIOContext<TScore, p, h> & context,
480            typename BlastMatchField<>::Enum const fieldId)
481 {
482     switch (fieldId)
483     {
484         case BlastMatchField<>::Enum::STD: // this is cought in the calling function
485             break;
486         case BlastMatchField<>::Enum::Q_SEQ_ID:
487             match.qId = context._stringBuffer;
488             break;
489 //         case ENUM::Q_GI: write(s,  * ); break;
490         case BlastMatchField<>::Enum::Q_ACC:
491             appendValue(record.qAccs, context._stringBuffer);
492             break;
493 //         case ENUM::Q_ACCVER: write(s,  * ); break;
494         case BlastMatchField<>::Enum::Q_LEN:
495             match.qLength = lexicalCast<TPos>(context._stringBuffer);
496             record.qLength = match.qLength;
497             break;
498         case BlastMatchField<>::Enum::S_SEQ_ID:
499             match.sId = context._stringBuffer;
500             break;
501 //         case ENUM::S_ALL_SEQ_ID: write(s,  * ); break;
502 //         case ENUM::S_GI: write(s,  * ); break;
503 //         case ENUM::S_ALL_GI: write(s,  * ); break;
504         case BlastMatchField<>::Enum::S_ACC:
505             appendValue(match.sAccs, context._stringBuffer);
506             break;
507 //         case ENUM::S_ACCVER: write(s,  * ); break;
508         case BlastMatchField<>::Enum::S_ALLACC:
509             strSplit(match.sAccs, context._stringBuffer, EqualsChar<';'>());
510             break;
511         case BlastMatchField<>::Enum::S_LEN:
512             match.sLength = lexicalCast<TPos>(context._stringBuffer);
513             break;
514         case BlastMatchField<>::Enum::Q_START:
515             match.qStart = lexicalCast<TPos>(context._stringBuffer);
516             break;
517         case BlastMatchField<>::Enum::Q_END:
518             match.qEnd = lexicalCast<TPos>(context._stringBuffer);
519             break;
520         case BlastMatchField<>::Enum::S_START:
521             match.sStart = lexicalCast<TPos>(context._stringBuffer);
522             break;
523         case BlastMatchField<>::Enum::S_END:
524             match.sEnd = lexicalCast<TPos>(context._stringBuffer);
525             break;
526 //         case ENUM::Q_SEQ: write(s,  * ); break;
527 //         case ENUM::S_SEQ: write(s,  * ); break;
528         case BlastMatchField<>::Enum::E_VALUE:
529             match.eValue = lexicalCast<double>(context._stringBuffer);
530             break;
531         case BlastMatchField<>::Enum::BIT_SCORE:
532             match.bitScore = lexicalCast<double>(context._stringBuffer);
533             break;
534         case BlastMatchField<>::Enum::SCORE:
535             match.alignStats.alignmentScore = lexicalCast<TPos>(context._stringBuffer);
536             break;
537         case BlastMatchField<>::Enum::LENGTH:
538             match.alignStats.alignmentLength = lexicalCast<TPos>(context._stringBuffer);
539             break;
540         case BlastMatchField<>::Enum::P_IDENT:
541             match.alignStats.alignmentIdentity = lexicalCast<double>(context._stringBuffer);
542             break;
543         case BlastMatchField<>::Enum::N_IDENT:
544             match.alignStats.numMatches = lexicalCast<TPos>(context._stringBuffer);
545             break;
546         case BlastMatchField<>::Enum::MISMATCH:
547             match.alignStats.numMismatches = lexicalCast<TPos>(context._stringBuffer);
548             break;
549         case BlastMatchField<>::Enum::POSITIVE:
550             match.alignStats.numPositiveScores = lexicalCast<TPos>(context._stringBuffer);
551             break;
552         case BlastMatchField<>::Enum::GAP_OPEN:
553             match.alignStats.numGapOpens = lexicalCast<TPos>(context._stringBuffer);
554             break;
555         case BlastMatchField<>::Enum::GAPS:
556             match.alignStats.numGaps = lexicalCast<TPos>(context._stringBuffer);
557             break;
558         case BlastMatchField<>::Enum::P_POS:
559             match.alignStats.alignmentSimilarity = lexicalCast<double>(context._stringBuffer);
560             break;
561         case BlastMatchField<>::Enum::FRAMES:
562         {
563             clear(context._setBuffer2);
564             strSplit(context._setBuffer2, context._stringBuffer, EqualsChar<'/'>());
565             if (length(context._setBuffer2) != 2)
566                 SEQAN_THROW(ParseError("Could not process frame string."));
567             match.qFrameShift = lexicalCast<int8_t>(context._setBuffer2[0]);
568             match.sFrameShift = lexicalCast<int8_t>(context._setBuffer2[1]);
569         } break;
570         case BlastMatchField<>::Enum::Q_FRAME:
571             match.qFrameShift = lexicalCast<int8_t>(context._stringBuffer);
572             break;
573         case BlastMatchField<>::Enum::S_FRAME:
574             match.sFrameShift = lexicalCast<int8_t>(context._stringBuffer);
575             break;
576 //         case ENUM::BTOP: write( * ); break;
577         case BlastMatchField<>::Enum::S_TAX_IDS:
578         {
579             StringSet<CharString> temp;
580             strSplit(temp, context._stringBuffer, EqualsChar<';'>());
581             for (auto const & s : temp)
582                 appendValue(match.sTaxIds, lexicalCast<uint64_t>(s));
583         } break;
584 //         case ENUM::S_SCI_NAMES: write( * ); break;
585 //         case ENUM::S_COM_NAMES: write( * ); break;
586 //         case ENUM::S_BLAST_NAMES: write( * ); break;
587 //         case ENUM::S_S_KINGDOMS: write( * ); break;
588 //         case ENUM::S_TITLE: write( * ); break;
589 //         case ENUM::S_ALL_TITLES: write( * ); break;
590 //         case ENUM::S_STRAND: write( * ); break;
591 //         case ENUM::Q_COV_S: write( * ); break;
592 //         case ENUM::Q_COV_HSP:
593         case BlastMatchField<>::Enum::LCA_ID:
594             record.lcaId = context._stringBuffer;
595             break;
596         case BlastMatchField<>::Enum::LCA_TAX_ID:
597             record.lcaTaxId = lexicalCast<uint32_t>(context._stringBuffer);
598             break;
599         default:
600             SEQAN_THROW(ParseError("The requested column type is not yet "
601                                    "implemented."));
602     };
603 }
604 
605 // ----------------------------------------------------------------------------
606 // Function _readMatch()
607 // ----------------------------------------------------------------------------
608 
609 template <typename TAlignRow0,
610           typename TAlignRow1,
611           typename TPos,
612           typename TQId,
613           typename TSId,
614           typename TFwdIterator,
615           typename TScore,
616           typename ... TSpecs,
617           BlastProgram p,
618           BlastTabularSpec h>
619 inline void
_readMatch(BlastMatch<TAlignRow0,TAlignRow1,TPos,TQId,TSId> & match,BlastRecord<TSpecs...> & record,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)620 _readMatch(BlastMatch<TAlignRow0, TAlignRow1, TPos, TQId, TSId> & match,
621            BlastRecord<TSpecs...> & record,
622            TFwdIterator & iter,
623            BlastIOContext<TScore, p, h> & context,
624            BlastTabular const &)
625 {
626     if (context.legacyFormat)
627     {
628         #if SEQAN_ENABLE_DEBUG
629         if ((length(context.fields) != 1) || (context.fields[0] != BlastMatchField<>::Enum::STD))
630             std::cerr << "Warning: custom fields set, but will be reset, because legacyFormat is also set.\n";
631         #endif
632         // set defaults
633         clear(context.fields);
634         appendValue(context.fields,  BlastMatchField<>::Enum::STD);
635     }
636 
637     // comments should have been read or skipped
638     if (SEQAN_UNLIKELY(!_onMatch(context, BlastTabular())))
639         SEQAN_THROW(ParseError("Not on beginning of Match (you should have skipped comments)."));
640 
641     match._maxInitialize(); // mark all members as "not set"
642 
643     clear(context._setBuffer1);
644 
645     auto & fields = context._setBuffer1;
646     strSplit(fields, context._lineBuffer, IsTab());
647 
648     // line in buffer was processed so new line is read from iter into buffer
649     _goNextLine(context, iter, BlastTabular());
650 
651     unsigned n = 0u;
652     for (BlastMatchField<>::Enum const f : context.fields)
653     {
654         // this field represents multiple fields
655         if (f == BlastMatchField<>::Enum::STD)
656         {
657             for (typename BlastMatchField<>::Enum const f2 : BlastMatchField<>::defaults)
658             {
659                 if (SEQAN_UNLIKELY(n >= length(fields)))
660                     SEQAN_THROW(ParseError("More columns expected than were present in file."));
661 
662                 context._stringBuffer = static_cast<decltype(context._stringBuffer)>(fields[n++]);
663                 _readField(match, record, context, f2);
664             }
665         } else
666         {
667             if (SEQAN_UNLIKELY(n >= length(fields)))
668                 SEQAN_THROW(ParseError("More columns expected than were present in file."));
669 
670             context._stringBuffer = static_cast<decltype(context._stringBuffer)>(fields[n++]);
671             _readField(match, record, context, f);
672         }
673     }
674 
675     // retransform the percentages to real numbers
676     if (!_memberIsSet(match.alignStats.numMatches) &&
677         _memberIsSet(match.alignStats.alignmentLength) &&
678         _memberIsSet(match.alignStats.alignmentIdentity))
679         match.alignStats.numMatches = std::lround(double(match.alignStats.alignmentLength) *
680                                             match.alignStats.alignmentIdentity / 100.0);
681     if (!_memberIsSet(match.alignStats.numPositiveScores) &&
682         _memberIsSet(match.alignStats.alignmentLength) &&
683         _memberIsSet(match.alignStats.alignmentSimilarity))
684         match.alignStats.numPositiveScores = std::lround(double(match.alignStats.alignmentLength) *
685                                             match.alignStats.alignmentSimilarity / 100.0);
686 
687     //TODO also transform numbers to percentages; possibly do other calculations
688 
689     // and compute gaps from other values
690     // since gaps are included in the mismatch count in BLAST_LEGACY they cannot be computed here
691     if (!context.legacyFormat &&
692         !_memberIsSet(match.alignStats.numGaps) &&
693         _memberIsSet(match.alignStats.alignmentLength) &&
694         _memberIsSet(match.alignStats.numMatches) &&
695         _memberIsSet(match.alignStats.numMismatches))
696         match.alignStats.numGaps = match.alignStats.alignmentLength -
697                                    match.alignStats.numMatches -
698                                    match.alignStats.numMismatches;
699 }
700 
701 // ----------------------------------------------------------------------------
702 // Function readRecord()
703 // ----------------------------------------------------------------------------
704 
705 template <typename TFwdIterator,
706           typename ... TSpecs,
707           typename TScore,
708           BlastProgram p,
709           BlastTabularSpec h>
710 inline void
_readRecordWithCommentLines(BlastRecord<TSpecs...> & blastRecord,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)711 _readRecordWithCommentLines(BlastRecord<TSpecs...> & blastRecord,
712                             TFwdIterator & iter,
713                             BlastIOContext<TScore, p, h> & context,
714                             BlastTabular const &)
715 {
716     _readCommentLines(blastRecord, iter, context, BlastTabular());
717 
718     if (!context.legacyFormat) // this is detected from the comments
719     {
720         // .matches already resized for us
721         for (auto & m : blastRecord.matches)
722         {
723             if (!_onMatch(context, BlastTabular()))
724             {
725                 appendValue(context.conformancyErrors,
726                             "Less matches than promised by the comments");
727                 break;
728             }
729 
730             _readMatch(m, blastRecord, iter, context, BlastTabular());
731         }
732 
733         if ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
734             appendValue(context.conformancyErrors,
735                         "More matches than promised by the comments");
736 
737         while ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
738         {
739             blastRecord.matches.emplace_back();
740             _readMatch(back(blastRecord.matches), blastRecord, iter, context, BlastTabular());
741         }
742     } else
743     {
744         while ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
745         {
746             blastRecord.matches.emplace_back();
747             _readMatch(back(blastRecord.matches), blastRecord, iter, context, BlastTabular());
748         }
749     }
750 }
751 
752 template <typename ... TSpecs,
753           typename TFwdIterator,
754           typename TScore,
755           BlastProgram p,
756           BlastTabularSpec h>
757 inline void
_readRecordWithoutCommentLines(BlastRecord<TSpecs...> & blastRecord,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)758 _readRecordWithoutCommentLines(BlastRecord<TSpecs...> & blastRecord,
759                                TFwdIterator & iter,
760                                BlastIOContext<TScore, p, h> & context,
761                                BlastTabular const &)
762 {
763     ++context._numberOfRecords;
764 
765     clear(blastRecord);
766 
767     auto it = begin(context._lineBuffer, Rooted()); // move into line below when seqan supports && properly
768     readUntil(blastRecord.qId, it, IsTab());
769 
770     auto curIdPlusTab = blastRecord.qId;
771     appendValue(curIdPlusTab, '\t');
772 
773     if ((context.fields[0] != BlastMatchField<>::Enum::STD) &&
774         (context.fields[0] != BlastMatchField<>::Enum::Q_SEQ_ID))
775     {
776         SEQAN_FAIL("ERROR: readRecord interface on comment-less format with custom "
777                    "fields not supported, unless first custom field is "
778                    "Q_SEQ_ID. Use the lowlevel readMatch interface instead.");
779     }
780 
781     while ((!atEnd(context, iter, BlastTabular())) && _onMatch(context, BlastTabular()))
782     {
783         blastRecord.matches.emplace_back();
784         // read remainder of line
785         _readMatch(back(blastRecord.matches), blastRecord, iter, context, BlastTabular());
786 
787         if (!startsWith(context._lineBuffer, curIdPlusTab)) // next record reached
788             break;
789     }
790 
791     if (empty(blastRecord.matches))
792         SEQAN_THROW(ParseError("No Matches could be read."));
793 }
794 
795 /*!
796  * @fn BlastTabularFileIn#readRecord
797  * @brief Read a record from a file in BlastTabular format.
798  * @headerfile seqan/blast.h
799  * @signature void readRecord(blastRecord, blastTabularIn);
800  *
801  * @param[out]    blastRecord  A @link BlastRecord @endlink to hold all information related to one query sequence.
802  * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
803  *
804  * @section Remarks
805  *
806  * This function will read an entire record from a blast tabular file, i.e. it will read the comment lines
807  * (if the format is @link BlastTabularSpec::COMMENTS @endlink) and 0-n @link BlastMatch @endlinkes belonging
808  * to one query.
809  *
810  * Please note that if there are no comment lines in the file the boundary
811  * between records is inferred from the indentity of the first field, i.e.
812  * non-standard field configurations must also have Q_SEQ_ID as their first @link BlastMatchField @endlink.
813  *
814  * @subsection Comment lines
815  *
816  * The @link BlastRecord::qId @endlink member of the record is read from the comment lines and the
817  * @link BlastRecord::matches @endlink are resized to the expected number of matches succeeding the comments.
818  *
819  * This function also sets many properties of blastTabularIn's @link BlastIOContext @endlink, including these members:
820  * <li> @link BlastIOContext::versionString @endlink: version string of the program.</li>
821  * <li> @link BlastIOContext::dbName @endlink: name of the database.</li>
822  * <li> @link BlastIOContext::fields @endlink: descriptors for the columns.</li>
823  * <li> @link BlastIOContext::fieldsAsStrings @endlink: labels of the columns
824  * as they appear in the file.</li>
825  * <li> @link BlastIOContext::conformancyErrors @endlink: if this StringSet is not empty, then there are issues in the
826  * comments.</li>
827  * <li> @link BlastIOContext::otherLines @endlink: any lines that cannot be interpreted; these always also imply
828  * conformancyErrors.</li>
829  * <li>@link BlastIOContext::legacyFormat @endlink: whether the record (and likely the entire file) is in
830  * legacyFormat.</li>
831  *
832  * It also sets the blast program run-time parameter of the context depending on the information found in the comments. If
833  * the compile time parameter was set on the context and they are different this will result in a critical error.
834  *
835  * Please note that for @link BlastIOContext::legacyFormat @endlink the @link BlastIOContext::fields @endlink member
836  * is always ignored, however @link BlastIOContext::fieldsAsStrings @endlink is still read from the comments, in case
837  * you want to process it.
838  *
839  * In case you do not wish the @link BlastIOContext::fields @endlink to be read from the comments, you can set
840  * context.@link BlastIOContext::ignoreFieldsInComments @endlink to true. This will be prevent it from being read and will
841  * allow you to specify it manually which might be relevant for reading the match lines.
842  *
843  * If the format is @link BlastTabularSpec::NO_COMMENTS @endlink none of the above happens and
844  * @link BlastRecord::qId @endlink is derived from the first match.
845  *
846  * @subsection Matches
847  *
848  * A match line contains 1 - n columns or fields, 12 by default.
849  * The @link BlastIOContext::fields @endlink member of the context is considered when reading these fields. It is
850  * usually extracted from the comment lines but can also be set by yourself if there are no comments or if you want
851  * to overwrite the comments' information (see above).
852  * You may specify less
853  * fields than are actually present, in this case the additional fields will be
854  * discarded. The parameter is ignored if @link BlastIOContext::legacyFormat @endlink is set.
855  *
856  * To differentiate between members of a @link BlastMatch @endlink that were read from the file and those that have
857  * not been set (e.g. both could be 0), the latter are initialized to their respective max-values.
858  *
859  * Please note that the only transformations made to the data are the following:
860  *
861  *  <li> computation of the number of identities (from the percentage) [default]</li>
862  *  <li> computation of the number of positives (from the percentage) [if given]</li>
863  *  <li> number of gaps computed from other values [default]</li>
864  *
865  * In contrast to @link BlastTabularFileOut#writeRecord @endlink no other transformations
866  * are made, e.g. the positions are still one-indexed and
867  * flipped for reverse strand matches. This is due to the required fields for
868  * retransformation (sequence lengths, frames) not being available in the
869  * default columns.
870  *
871  * @throw IOError On low-level I/O errors.
872  * @throw ParseError On high-level file format errors.
873  */
874 
875 template <typename ... TSpecs,
876           typename TFwdIterator,
877           typename TScore,
878           BlastProgram p,
879           BlastTabularSpec h>
880 inline void
readRecord(BlastRecord<TSpecs...> & blastRecord,TFwdIterator & iter,BlastIOContext<TScore,p,h> & context,BlastTabular const &)881 readRecord(BlastRecord<TSpecs...> & blastRecord,
882            TFwdIterator & iter,
883            BlastIOContext<TScore, p, h> & context,
884            BlastTabular const &)
885 {
886     if (context.tabularSpec == BlastTabularSpec::NO_COMMENTS)
887         _readRecordWithoutCommentLines(blastRecord, iter, context, BlastTabular());
888     else
889         _readRecordWithCommentLines(blastRecord, iter, context, BlastTabular());
890 }
891 
892 template <typename ... TSpecs,
893           typename TContext>
894 inline void
readRecord(BlastRecord<TSpecs...> & blastRecord,BlastTabularFileIn<TContext> & formattedFile)895 readRecord(BlastRecord<TSpecs...> & blastRecord,
896            BlastTabularFileIn<TContext> & formattedFile)
897 {
898     readRecord(blastRecord, formattedFile.iter, context(formattedFile), BlastTabular());
899 }
900 
901 // ----------------------------------------------------------------------------
902 // Function readHeader()
903 // ----------------------------------------------------------------------------
904 
905 /*!
906  * @fn BlastTabularFileIn#readHeader
907  * @headerfile seqan/blast.h
908  * @brief Read the header (top-most section) of a BlastTabular file.
909  * @signature void readHeader(blastTabularIn);
910  *
911  * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
912  *
913  * @throw IOError On low-level I/O errors.
914  * @throw ParseError On high-level file format errors.
915  */
916 
917 template <typename TFwdIterator,
918           typename TScore,
919           BlastProgram p,
920           BlastTabularSpec h>
921 inline void
readHeader(BlastIOContext<TScore,p,h> & context,TFwdIterator & iter,BlastTabular const &)922 readHeader(BlastIOContext<TScore, p, h> & context,
923            TFwdIterator & iter,
924            BlastTabular const & /*tag*/)
925 {
926     readLine(context._lineBuffer, iter); // fill the line-buffer the first time
927 
928     if (context.tabularSpec == BlastTabularSpec::UNKNOWN)
929     {
930         if (_onMatch(context, BlastTabular()))
931             context.tabularSpec = BlastTabularSpec::NO_COMMENTS;
932         else
933             context.tabularSpec = BlastTabularSpec::COMMENTS;
934     }
935 }
936 
937 template <typename TContext>
938 inline void
readHeader(BlastTabularFileIn<TContext> & formattedFile)939 readHeader(BlastTabularFileIn<TContext> & formattedFile)
940 {
941     readHeader(context(formattedFile), formattedFile.iter, BlastTabular());
942 }
943 
944 // ----------------------------------------------------------------------------
945 // Function readFooter()
946 // ----------------------------------------------------------------------------
947 
948 /*!
949  * @fn BlastTabularFileIn#readFooter
950  * @headerfile seqan/blast.h
951  * @brief Read the footer (bottom-most section) of a BlastTabular file.
952  * @signature void readFooter(blastTabularIn);
953  *
954  * @param[in,out] blastTabularIn A @link BlastTabularFileIn @endlink formattedFile.
955  *
956  * @throw IOError On low-level I/O errors.
957  * @throw ParseError On high-level file format errors.
958  */
959 
960 template <typename TFwdIterator,
961           typename TScore,
962           BlastProgram p,
963           BlastTabularSpec h>
964 inline void
readFooter(BlastIOContext<TScore,p,h> & context,TFwdIterator & iter,BlastTabular const &)965 readFooter(BlastIOContext<TScore, p, h> & context,
966            TFwdIterator & iter,
967            BlastTabular const & /*tag*/)
968 {
969     clear(context.otherLines);
970     clear(context.conformancyErrors);
971     clear(context.fieldsAsStrings);
972     clear(context.fields);
973 
974     if ((context.tabularSpec == BlastTabularSpec::COMMENTS) && !context.legacyFormat)
975     {
976         if (SEQAN_UNLIKELY(!startsWith(context._lineBuffer, "# BLAST processed")))
977         {
978             std::cout << "\"" << context._lineBuffer << "\"" << std::endl;
979             SEQAN_FAIL("ERROR: Tried to read footer, but was not on footer.");
980         }
981         clear(context._stringBuffer);
982         auto it = seqan::begin(context._lineBuffer);
983         it += 18; // skip "BLAST processed "
984         readUntil(context._stringBuffer, it,  IsBlank());
985 
986         uint64_t numRecords = lexicalCast<uint64_t>(context._stringBuffer);
987 
988         clear(context.conformancyErrors);
989         if (context._numberOfRecords < numRecords)
990             appendValue(context.conformancyErrors, "The file claims to contain more records than you read.");
991         else if (context._numberOfRecords > numRecords)
992             appendValue(context.conformancyErrors, "The file claims to contain less records than you read.");
993     }
994 
995     if (!atEnd(iter))
996         appendValue(context.conformancyErrors, "Expected to be at end of file.");
997 
998     _goNextLine(context, iter, BlastTabular());
999 }
1000 
1001 template <typename TContext>
1002 inline void
readFooter(BlastTabularFileIn<TContext> & formattedFile)1003 readFooter(BlastTabularFileIn<TContext> & formattedFile)
1004 {
1005     readFooter(context(formattedFile), formattedFile.iter, BlastTabular());
1006 }
1007 
1008 } // namespace seqan
1009 
1010 #endif // SEQAN_BLAST_READ_BLAST_TABULAR_H_
1011