1 // ========================================================================== 2 // SeqAn - The Library for Sequence Analysis 3 // ========================================================================== 4 // Copyright (c) 2006-2018, Knut Reinert, FU Berlin 5 // All rights reserved. 6 // 7 // Redistribution and use in source and binary forms, with or without 8 // modification, are permitted provided that the following conditions are met: 9 // 10 // * Redistributions of source code must retain the above copyright 11 // notice, this list of conditions and the following disclaimer. 12 // * Redistributions in binary form must reproduce the above copyright 13 // notice, this list of conditions and the following disclaimer in the 14 // documentation and/or other materials provided with the distribution. 15 // * Neither the name of Knut Reinert or the FU Berlin nor the names of 16 // its contributors may be used to endorse or promote products derived 17 // from this software without specific prior written permission. 18 // 19 // THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" 20 // AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE 21 // IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE 22 // ARE DISCLAIMED. IN NO EVENT SHALL KNUT REINERT OR THE FU BERLIN BE LIABLE 23 // FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL 24 // DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR 25 // SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER 26 // CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT 27 // LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY 28 // OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH 29 // DAMAGE. 30 // 31 // ========================================================================== 32 // Author: Hannes Hauswedell <hannes.hauswedell@fu-berlin.de> 33 // ========================================================================== 34 // This file contains the BlastIOContext's code 35 // ========================================================================== 36 37 #ifndef SEQAN_BLAST_BLAST_IO_CONTEXT_H__ 38 #define SEQAN_BLAST_BLAST_IO_CONTEXT_H__ 39 40 namespace seqan 41 { 42 43 // ============================================================================ 44 // Forwards 45 // ============================================================================ 46 47 template <typename TScore> 48 struct BlastScoringScheme; 49 50 // ============================================================================ 51 // Metafunctions 52 // ============================================================================ 53 54 // ---------------------------------------------------------------------------- 55 // Mfn BlastIOContextStringType_ 56 // ---------------------------------------------------------------------------- 57 58 template <typename TContext> 59 struct BlastIOContextStringType_ 60 { 61 typedef std::string Type; 62 }; 63 64 // ============================================================================ 65 // Tags, Classes, Enums 66 // ============================================================================ 67 68 // ---------------------------------------------------------------------------- 69 // Class BlastIOContext 70 // ---------------------------------------------------------------------------- 71 72 /*! 73 * @class BlastIOContext 74 * @headerfile <seqan/blast.h> 75 * @signature template <typename TScore_ = Blosum62, 76 * BlastProgram p = BlastProgram::UNKNOWN, BlastTabularSpec h = BlastTabularSpec::UNKNOWN> 77 * struct BlastIOContext { ... }; 78 * 79 * @brief An object that holds file global information and buffers for BlastIO 80 * 81 * @tparam TScore Type of the @link Score @endlink object used. 82 * @tparam p @link BlastProgram @endlink as compile-time parameter. 83 * @tparam h @link BlastTabularSpec @endlink as compile-time parameter. 84 * 85 * This is a part of the Blast formatted files. Before writing, some of the context's members should be set; after 86 * reading it will contain 87 * all information from the file that did not belong to a @link BlastRecord @endlink, e.g. the name of the database. 88 * It also contains buffers for internal use. 89 * 90 * You should re-use this object (i.e. only create it once for 91 * every file that you read/write). And you don't need to and should not clear() 92 * this, except when restarting IO on a different file. 93 * 94 * To speed-up file writing slightly you can set the value template parameters <tt>p</tt> and/or <tt>h</tt> to something 95 * other than ::DYNAMIC at compile-time (e.g. if you know that you will be printing only BLASTX), but then you won't 96 * be able to modify these values at run-time. For file reading this is also possible, but usually the 97 * added flexibility of automatically detecting these values is prefferable. 98 * 99 * If not explicitly stated otherwise, the member variables are <i>out-parameters</i> of <tt>readHeader()</tt>, 100 * <tt>readRecord()</tt> and <tt>readFooter()</tt>, i.e. they are set by these functions; and they are 101 * <i>in-parameters</i> to <tt>writeHeader()</tt>, <tt>writeRecord()</tt> and <tt>writeFooter()</tt>, i.e. they 102 * influence these functions' output. 103 * 104 * See @link BlastTabularFileOut @endlink and @link BlastReportFileOut @endlink for more complete examples of usage. 105 */ 106 107 template <typename TScore_ = Blosum62, 108 BlastProgram p = BlastProgram::DYNAMIC, 109 BlastTabularSpec h = BlastTabularSpec::DYNAMIC> 110 struct BlastIOContext 111 { 112 typedef TScore_ TScore; 113 typedef typename BlastIOContextStringType_<BlastIOContext>::Type TString; 114 115 /*! 116 * @var BlastProgramSelector BlastIOContext::blastProgram; 117 * @brief The @link BlastProgram @endlink. 118 * 119 * @section Remarks 120 * 121 * Behaves exactly like an enum of type @link BlastProgram @endlink, unless the second template parameter was 122 * specified to make this a compile-time constant. See @link BlastProgramSelector @endlink for more information. 123 */ 124 BlastProgramSelector<p> blastProgram; 125 126 /*! 127 * @var BlastTabularSpecSelector BlastIOContext::tabularSpec; 128 * @brief The @link BlastTabularSpec @endlink. 129 * 130 * @section Remarks 131 * 132 * Behaves exactly like an enum of type @link BlastTabularSpec @endlink, unless the third template parameter was 133 * specified to make this a compile-time constant. See @link BlastTabularSpecSelector @endlink for more information. 134 */ 135 BlastTabularSpecSelector<h> tabularSpec; 136 137 /*! 138 * @var BlastScoringScheme<TScore> BlastIOContext::scoringScheme; 139 * @brief The @link BlastScoringScheme @endlink. 140 */ 141 BlastScoringScheme<TScore> scoringScheme; 142 143 /*! 144 * @var TString BlastIOContext::versionString; 145 * @brief The blast version string. 146 * 147 * @section Remarks 148 * 149 * Used when writing @link BlastReportFileOut @endlink and @link BlastTabularFileOut @endlink if the context's tabularSpec 150 * is set to BlastTabularSpec::COMMENTS. Defaults to a version string based on the emulated 151 * blast version and the current SeqAn version. 152 * When reading from @link BlastTabularFileOut @endlink the corresponding line is extracted from the comment lines 153 * (if present). 154 */ 155 TString versionString; _setDefaultVersionStringBlastIOContext156 void _setDefaultVersionString() 157 { 158 clear(versionString); 159 append(versionString, _programTagToString(blastProgram)); 160 append(versionString, " 2.2.26"); 161 if (!legacyFormat) 162 append(versionString, "+"); 163 append(versionString, " [I/O Module of SeqAn-"); 164 append(versionString, std::to_string(SEQAN_VERSION_MAJOR)); 165 append(versionString, '.'); 166 append(versionString, std::to_string(SEQAN_VERSION_MINOR)); 167 append(versionString, '.'); 168 append(versionString, std::to_string(SEQAN_VERSION_PATCH)); 169 append(versionString, ", http://www.seqan.de]"); 170 } 171 172 /*! 173 * @var bool BlastIOContext::legacyFormat; 174 * @brief Whether to use the legacy format (only @link BlastTabular @endlink). 175 * 176 * @section Remarks 177 * 178 * Setting this flag when writing to a @link BlastTabularFileOut @endlink (that has BlastTabularSpec::COMMENTS set) 179 * will result in the legacy version of the comments being written. In the legacy format the mismatches column 180 * also includes all gaps in addition to mismatches. 181 * Note that many other features like custom fields are not supported in this format. 182 * 183 * When reading @link BlastTabularFileOut @endlink this flag will automatically be set based on the comments (if a 184 * they exist). 185 */ 186 bool legacyFormat = false; 187 188 /*! 189 * @var TString BlastIOContext::dbName; 190 * @brief Name of the dabase or path to the file. 191 */ 192 TString dbName; 193 194 /*! 195 * @var uint64_t BlastIOContext::dbTotalLength; 196 * @brief Summed up sequence length of the database. 197 */ 198 uint64_t dbTotalLength = 0u; 199 200 /*! 201 * @var uint64_t BlastIOContext::dbNumberOfSeqs; 202 * @brief Number of sequences in the database. 203 */ 204 uint64_t dbNumberOfSeqs = 0u; 205 206 /*! 207 * @var StringSet<TString> BlastIOContext::otherLines; 208 * @brief A StringSet that will contain all comment lines that 209 * could not be interpreted in another way (only @link BlastTabularFileIn @endlink). 210 */ 211 StringSet<TString, Owner<ConcatDirect<>>> otherLines; 212 213 /*! 214 * @var std::vector<BlastMatchField::Enum> BlastIOContext::fields; 215 * @brief The fields (types of columns) in @link BlastTabular @endlink-formats. 216 * 217 * @section Remarks 218 * 219 * This is an <i>out-parameter</i> for: 220 * <li> @link BlastTabularFileIn#readRecord @endlink iff tabularSpec == COMMENTS (otherwise it can't be deduced)</li> 221 * 222 * This is an <i>in-parameter</i> for: 223 * <li> @link BlastTabularFileIn#readRecord @endlink if tabularSpec != COMMENTS (specified fields will be expected)</li> 224 * <li> @link BlastTabularFileOut#writeRecord @endlink (specified fields will written) 225 * 226 * Setting @link BlastIOContext::ignoreFieldsInComments @endlink will make this variable be an <i>in-parameter</i> for 227 * the first case, as well. This variable is ignored in the legacy formats and for non-tabular formats. 228 */ 229 std::vector<typename BlastMatchField<>::Enum> fields = { BlastMatchField<>::Enum::STD }; 230 231 /*! 232 * @var StringSet<TString> BlastIOContext::fieldsAsStrings; 233 * @brief The fields (types of columns) in @link BlastTabular @endlink-formats, but as uninterpreted strings. 234 * 235 * @section Remarks 236 * 237 * Useful when the comment lines do not conform to standards and you want to extract the verbatim column labels or 238 * if you wish to print non-standard column labels (which you shouldn't!). 239 */ 240 StringSet<TString, Owner<ConcatDirect<>>> fieldsAsStrings; 241 242 /*! 243 * @var bool BlastIOContext::ignoreFieldsInComments; 244 * @brief Use fields as in-parameter for readRecord as well (only @link BlastTabularFileIn @endlink). 245 * 246 * @section Remarks 247 * 248 * See @link BlastTabularFileIn#readRecord @endlink. Use this when the comment lines do not 249 * conform to standards (and the fields can't be read), but you know that 250 * the matches are in the given, e.g. default format. 251 */ 252 bool ignoreFieldsInComments = false; 253 254 /*! 255 * @var StringSet<TString> BlastIOContext::conformancyErrors; 256 * @brief Holds non fatal error messages when reading from @link BlastTabularFileIn @endlink. 257 * 258 * @section Remarks 259 * 260 * After doing a @link BlastTabularFileIn#readRecord @endlink this will indicate whether the 261 * comment lines contained non-fatal parse errors, usually the result 262 * of a file written by a sloppy blast implementation or possibly a bug in SeqAn. 263 * An empty StringSet indicates that all is good. 264 */ 265 StringSet<TString, Owner<ConcatDirect<>>> conformancyErrors; 266 267 // ------- CACHES, BUFFERS and INTERNALS --------- // 268 269 // counted internally for TabularFooter 270 uint64_t _numberOfRecords = 0; 271 272 // cache for length adjustments in blast statistics 273 std::unordered_map<uint64_t, uint64_t> _cachedLengthAdjustments; 274 275 // io-buffers 276 TString _lineBuffer; // holds the current line 277 TString _stringBuffer; 278 StringSet<TString, Owner<ConcatDirect<>>> _setBuffer1; 279 StringSet<TString, Owner<ConcatDirect<>>> _setBuffer2; 280 BlastMatch<> bufMatch; 281 BlastRecord<> bufRecord; 282 }; 283 284 } 285 286 #endif 287