1 // ==========================================================================
2 //                 SeqAn - The Library for Sequence Analysis
3 // ==========================================================================
4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin
5 // All rights reserved.
6 //
7 // Redistribution and use in source and binary forms, with or without
8 // modification, are permitted provided that the following conditions are met:
9 //
10 //     * Redistributions of source code must retain the above copyright
11 //       notice, this list of conditions and the following disclaimer.
12 //     * Redistributions in binary form must reproduce the above copyright
13 //       notice, this list of conditions and the following disclaimer in the
14 //       documentation and/or other materials provided with the distribution.
15 //     * Neither the name of Knut Reinert or the FU Berlin nor the names of
16 //       its contributors may be used to endorse or promote products derived
17 //       from this software without specific prior written permission.
18 //
19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE
23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH
29 // DAMAGE.
30 //
31 // ==========================================================================
32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de>
33 // ==========================================================================
34 // This file contains the BlastIOContext's code
35 // ==========================================================================
36 
37 #ifndef SEQAN_BLAST_BLAST_IO_CONTEXT_H__
38 #define SEQAN_BLAST_BLAST_IO_CONTEXT_H__
39 
40 namespace seqan
41 {
42 
43 // ============================================================================
44 // Forwards
45 // ============================================================================
46 
// Forward declaration only; the definition is not part of this header.
// It is needed here because BlastIOContext holds a BlastScoringScheme<TScore>
// member (BlastIOContext::scoringScheme).
template <typename TScore>
struct BlastScoringScheme;
49 
50 // ============================================================================
51 // Metafunctions
52 // ============================================================================
53 
54 // ----------------------------------------------------------------------------
55 // Mfn BlastIOContextStringType_
56 // ----------------------------------------------------------------------------
57 
// Maps a context type to the string type it uses for its text members and
// buffers. The primary template yields std::string for every TContext.
template <typename TContext>
struct BlastIOContextStringType_
{
    using Type = std::string;
};
63 
64 // ============================================================================
65 // Tags, Classes, Enums
66 // ============================================================================
67 
68 // ----------------------------------------------------------------------------
69 // Class BlastIOContext
70 // ----------------------------------------------------------------------------
71 
72 /*!
73  * @class BlastIOContext
74  * @headerfile <seqan/blast.h>
 * @signature template <typename TScore_ = Blosum62,
 * BlastProgram p = BlastProgram::DYNAMIC, BlastTabularSpec h = BlastTabularSpec::DYNAMIC>
77  * struct BlastIOContext { ... };
78  *
79  * @brief An object that holds file global information and buffers for BlastIO
80  *
81  * @tparam TScore   Type of the @link Score @endlink object used.
82  * @tparam p        @link BlastProgram @endlink as compile-time parameter.
83  * @tparam h        @link BlastTabularSpec @endlink as compile-time parameter.
84  *
85  * This is a part of the Blast formatted files. Before writing, some of the context's members should be set; after
86  * reading it will contain
87  * all information from the file that did not belong to a @link BlastRecord @endlink, e.g. the name of the database.
88  * It also contains buffers for internal use.
89  *
90  * You should re-use this object (i.e. only create it once for
91  * every file that you read/write). And you don't need to and should not clear()
92  * this, except when restarting IO on a different file.
93  *
 * To speed up file writing slightly you can set the value template parameters <tt>p</tt> and/or <tt>h</tt> to something
 * other than ::DYNAMIC at compile-time (e.g. if you know that you will be printing only BLASTX), but then you won't
 * be able to modify these values at run-time. For file reading this is also possible, but usually the
 * added flexibility of automatically detecting these values is preferable.
98  *
99  * If not explicitly stated otherwise, the member variables are <i>out-parameters</i> of <tt>readHeader()</tt>,
100  * <tt>readRecord()</tt> and <tt>readFooter()</tt>, i.e. they are set by these functions; and they are
101  * <i>in-parameters</i> to  <tt>writeHeader()</tt>, <tt>writeRecord()</tt> and <tt>writeFooter()</tt>, i.e. they
102  * influence these functions' output.
103  *
104  * See @link BlastTabularFileOut @endlink and @link BlastReportFileOut @endlink for more complete examples of usage.
105  */
106 
107 template <typename TScore_ = Blosum62,
108           BlastProgram p = BlastProgram::DYNAMIC,
109           BlastTabularSpec h = BlastTabularSpec::DYNAMIC>
110 struct BlastIOContext
111 {
112     typedef TScore_ TScore;
113     typedef typename BlastIOContextStringType_<BlastIOContext>::Type TString;
114 
115     /*!
116      * @var BlastProgramSelector BlastIOContext::blastProgram;
117      * @brief The @link BlastProgram @endlink.
118      *
119      * @section Remarks
120      *
121      * Behaves exactly like an enum of type @link BlastProgram @endlink, unless the second template parameter was
122      * specified to make this a compile-time constant. See @link BlastProgramSelector @endlink for more information.
123      */
124     BlastProgramSelector<p> blastProgram;
125 
126     /*!
127      * @var BlastTabularSpecSelector BlastIOContext::tabularSpec;
128      * @brief The @link BlastTabularSpec @endlink.
129      *
130      * @section Remarks
131      *
132      * Behaves exactly like an enum of type @link BlastTabularSpec @endlink, unless the third template parameter was
133      * specified to make this a compile-time constant. See @link BlastTabularSpecSelector @endlink for more information.
134      */
135     BlastTabularSpecSelector<h> tabularSpec;
136 
137     /*!
138      * @var BlastScoringScheme<TScore> BlastIOContext::scoringScheme;
139      * @brief The @link BlastScoringScheme @endlink.
140      */
141     BlastScoringScheme<TScore> scoringScheme;
142 
143     /*!
144      * @var TString BlastIOContext::versionString;
145      * @brief The blast version string.
146      *
147      * @section Remarks
148      *
149      * Used when writing @link BlastReportFileOut @endlink and @link BlastTabularFileOut @endlink if the context's tabularSpec
150      * is set to BlastTabularSpec::COMMENTS. Defaults to a version string based on the emulated
151      * blast version and the current SeqAn version.
152      * When reading from @link BlastTabularFileOut @endlink the corresponding line is extracted from the comment lines
153      * (if present).
154      */
155     TString versionString;
_setDefaultVersionStringBlastIOContext156     void _setDefaultVersionString()
157     {
158         clear(versionString);
159         append(versionString, _programTagToString(blastProgram));
160         append(versionString, " 2.2.26");
161         if (!legacyFormat)
162             append(versionString, "+");
163         append(versionString, " [I/O Module of SeqAn-");
164         append(versionString, std::to_string(SEQAN_VERSION_MAJOR));
165         append(versionString, '.');
166         append(versionString, std::to_string(SEQAN_VERSION_MINOR));
167         append(versionString, '.');
168         append(versionString, std::to_string(SEQAN_VERSION_PATCH));
169         append(versionString, ", http://www.seqan.de]");
170     }
171 
172     /*!
173      * @var bool BlastIOContext::legacyFormat;
174      * @brief Whether to use the legacy format (only @link BlastTabular @endlink).
175      *
176      * @section Remarks
177      *
178      * Setting this flag when writing to a @link BlastTabularFileOut @endlink (that has BlastTabularSpec::COMMENTS set)
179      * will result in the legacy version of the comments being written. In the legacy format the mismatches column
180      * also includes all gaps in addition to mismatches.
181      * Note that many other features like custom fields are not supported in this format.
182      *
183      * When reading @link BlastTabularFileOut @endlink this flag will automatically be set based on the comments (if a
184      * they exist).
185      */
186     bool legacyFormat = false;
187 
188     /*!
189      * @var TString BlastIOContext::dbName;
190      * @brief Name of the dabase or path to the file.
191      */
192     TString         dbName;
193 
194     /*!
195      * @var uint64_t BlastIOContext::dbTotalLength;
196      * @brief Summed up sequence length of the database.
197      */
198     uint64_t        dbTotalLength = 0u;
199 
200     /*!
201      * @var uint64_t BlastIOContext::dbNumberOfSeqs;
202      * @brief Number of sequences in the database.
203      */
204     uint64_t        dbNumberOfSeqs = 0u;
205 
206     /*!
207      * @var StringSet<TString> BlastIOContext::otherLines;
208      * @brief A StringSet that will contain all comment lines that
209      * could not be interpreted in another way (only @link BlastTabularFileIn @endlink).
210      */
211     StringSet<TString, Owner<ConcatDirect<>>> otherLines;
212 
213     /*!
214      * @var std::vector<BlastMatchField::Enum> BlastIOContext::fields;
215      * @brief The fields (types of columns) in @link BlastTabular @endlink-formats.
216      *
217      * @section Remarks
218      *
219      * This is an <i>out-parameter</i> for:
220      * <li> @link BlastTabularFileIn#readRecord @endlink iff tabularSpec == COMMENTS (otherwise it can't be deduced)</li>
221      *
222      * This is an <i>in-parameter</i> for:
223      * <li> @link BlastTabularFileIn#readRecord @endlink if tabularSpec != COMMENTS (specified fields will be expected)</li>
224      * <li> @link BlastTabularFileOut#writeRecord @endlink (specified fields will written)
225      *
226      * Setting @link BlastIOContext::ignoreFieldsInComments @endlink will make this variable be an <i>in-parameter</i> for
227      * the first case, as well. This variable is ignored in the legacy formats and for non-tabular formats.
228      */
229     std::vector<typename BlastMatchField<>::Enum> fields = { BlastMatchField<>::Enum::STD };
230 
231     /*!
232      * @var StringSet<TString> BlastIOContext::fieldsAsStrings;
233      * @brief The fields (types of columns) in @link BlastTabular @endlink-formats, but as uninterpreted strings.
234      *
235      * @section Remarks
236      *
237      * Useful when the comment lines do not conform to standards and you want to extract the verbatim column labels or
238      * if you wish to print non-standard column labels (which you shouldn't!).
239      */
240     StringSet<TString, Owner<ConcatDirect<>>> fieldsAsStrings;
241 
242     /*!
243      * @var bool BlastIOContext::ignoreFieldsInComments;
244      * @brief Use fields as in-parameter for readRecord as well (only @link BlastTabularFileIn @endlink).
245      *
246      * @section Remarks
247      *
248      * See @link BlastTabularFileIn#readRecord @endlink. Use this when the comment lines do not
249      * conform to standards (and the fields can't be read), but you know that
250      * the matches are in the given, e.g. default format.
251      */
252     bool ignoreFieldsInComments = false;
253 
254     /*!
255      * @var StringSet<TString> BlastIOContext::conformancyErrors;
256      * @brief Holds non fatal error messages when reading from @link BlastTabularFileIn @endlink.
257      *
258      * @section Remarks
259      *
260      * After doing a @link BlastTabularFileIn#readRecord @endlink this will indicate whether the
261      * comment lines contained non-fatal parse errors, usually the result
262      * of a file written by a sloppy blast implementation or possibly a bug in SeqAn.
263      * An empty StringSet indicates that all is good.
264      */
265     StringSet<TString, Owner<ConcatDirect<>>> conformancyErrors;
266 
267     // ------- CACHES, BUFFERS and INTERNALS --------- //
268 
269     // counted internally for TabularFooter
270     uint64_t _numberOfRecords = 0;
271 
272     // cache for length adjustments in blast statistics
273     std::unordered_map<uint64_t, uint64_t> _cachedLengthAdjustments;
274 
275     // io-buffers
276     TString _lineBuffer; // holds the current line
277     TString _stringBuffer;
278     StringSet<TString, Owner<ConcatDirect<>>> _setBuffer1;
279     StringSet<TString, Owner<ConcatDirect<>>> _setBuffer2;
280     BlastMatch<> bufMatch;
281     BlastRecord<> bufRecord;
282 };
283 
284 }
285 
286 #endif
287