1 #ifndef FORMATGUESS__HPP
2 #define FORMATGUESS__HPP
3
4 /* $Id: format_guess.hpp 629211 2021-04-12 18:52:01Z ivanov $
5 * ===========================================================================
6 *
7 * PUBLIC DOMAIN NOTICE
8 * National Center for Biotechnology Information
9 *
10 * This software/database is a "United States Government Work" under the
11 * terms of the United States Copyright Act. It was written as part of
12 * the author's official duties as a United States Government employee and
13 * thus cannot be copyrighted. This software/database is freely available
14 * to the public for use. The National Library of Medicine and the U.S.
15 * Government have not placed any restriction on its use or reproduction.
16 *
17 * Although all reasonable efforts have been taken to ensure the accuracy
18 * and reliability of the software and data, the NLM and the U.S.
19 * Government do not and cannot warrant the performance or results that
20 * may be obtained by using this software or data. The NLM and the U.S.
21 * Government disclaim all warranties, express or implied, including
22 * warranties of performance, merchantability or fitness for any particular
23 * purpose.
24 *
25 * Please cite the author in any work or product based on this material.
26 *
27 * ===========================================================================
28 *
29 * Author: Anatoliy Kuznetsov
30 *
31 * File Description: Different "fuzzy-logic" methods to identify file formats.
32 *
33 */
34
35 #include <corelib/ncbistd.hpp>
36 #include <bitset>
37
38 BEGIN_NCBI_SCOPE
39
40 class CFormatGuessHints;
41
42
43 //////////////////////////////////////////////////////////////////
44 ///
45 /// Class implements different ad-hoc unreliable file format
46 /// identifications.
47 ///
48
49 class NCBI_XUTIL_EXPORT CFormatGuess
50 {
51 public:
52 /// The formats are checked in the same order as declared here.
53 enum EFormat {
54 // WARNING! Never change numeric values of these enumerators!
55 // E.g. these values are hard-coded in the Local Data Storage (LDS)
56 // index databases.
57 eUnknown = 0, ///< unknown format
58 eBinaryASN = 1, ///< Binary ASN.1
59 eRmo = 2, ///< RepeatMasker Output
60 eGtf_POISENED = 3, ///< Old and Dead GFF/GTF style annotations
61 eGlimmer3 = 4, ///< Glimmer3 predictions
62 eAgp = 5, ///< AGP format assembly, AgpRead
63 eXml = 6, ///< XML
64 eWiggle = 7, ///< UCSC WIGGLE file format
65 eBed = 8, ///< UCSC BED file format, CBedReader
66 eBed15 = 9, ///< UCSC BED15 or microarray format
67 eNewick = 10, ///< Newick file
68 eAlignment = 11, ///< Text alignment
69 eDistanceMatrix = 12, ///< Distance matrix file
70 eFlatFileSequence = 13, ///< GenBank/GenPept/DDBJ/EMBL flat-file
71 ///< sequence portion
72 eFiveColFeatureTable = 14, ///< Five-column feature table
73 eSnpMarkers = 15, ///< SNP Marker flat file
74 eFasta = 16, ///< FASTA format sequence record, CFastaReader
75 eTextASN = 17, ///< Text ASN.1
76 eTaxplot = 18, ///< Taxplot file
77 ePhrapAce = 19, ///< Phrap ACE assembly file
78 eTable = 20, ///< Generic table
79 eGtf = 21, ///< New GTF, CGtfReader
80 eGff3 = 22, ///< GFF3, CGff3Reader
81 eGff2 = 23, ///< GFF2, CGff2Reader, any GFF-like that doesn't fit the others
82 eHgvs = 24, ///< HGVS, CHgvsParser
83 eGvf = 25, ///< GVF, CGvfReader
84 eZip = 26, ///< zip compressed file
85 eGZip = 27, ///< GNU zip compressed file
86 eBZip2 = 28, ///< bzip2 compressed file
87 eLzo = 29, ///< lzo compressed file
88 eSra = 30, ///< INSDC Sequence Read Archive file
89 eBam = 31, ///< Binary alignment/map file
90 eVcf = 32, ///< VCF, CVcfReader
91 eUCSCRegion = 33, ///< USCS Region file format
92 eGffAugustus = 34, ///< GFFish output of Augustus Gene Prediction
93 eJSON = 35, ///< JSON
94 ePsl = 36, ///< PSL alignment format
95 // The following formats are not yet recognized by CFormatGuess - CXX-10039
96 eAltGraphX = 37,
97 eBed5FloatScore = 38,
98 eBedGraph = 39,
99 eBedRnaElements = 40,
100 eBigBarChart = 41,
101 eBigBed = 42,
102 eBigPsl = 43,
103 eBigChain = 44,
104 eBigMaf = 45,
105 eBigWig = 46,
106 eBroadPeak = 47,
107 eChain = 48,
108 eClonePos = 49,
109 eColoredExon = 50,
110 eCtgPos = 51,
111 eDownloadsOnly = 52,
112 eEncodeFiveC = 53,
113 eExpRatio = 54,
114 eFactorSource = 55,
115 eGenePred = 56,
116 eLd2 = 57,
117 eNarrowPeak = 58,
118 eNetAlign = 59,
119 ePeptideMapping = 60,
120 eRmsk = 61,
121 eSnake = 62,
122 eVcfTabix = 63,
123 eWigMaf = 64,
124
125 // The following formats *are* recognized by CFormatGuess:
126 eFlatFileGenbank = 65,
127 eFlatFileEna = 66,
128 eFlatFileUniProt = 67,
129
130 // *** Adding new format codes? ***
131 // (1) A sanity check in the implementation depends on the format codes being
132 // consecutive. Hence no gaps allowed!
133 // (2) Heed the warning above about never changing an already existing
134 // format code!
135 // (3) You must provide a display name for the new format. Do that in
136 // sm_FormatNames.
137 // (4) You must add your new format to sm_CheckOrder (unless you don't want your
138 // format actually being checked and recognized.
139
140 /// Max value of EFormat
141 eFormat_max
142 };
143
144 enum ESequenceType {
145 eUndefined,
146 eNucleotide,
147 eProtein
148 };
149
150 enum EMode {
151 eQuick,
152 eThorough
153 };
154
155 enum ESTStrictness {
156 eST_Lax, ///< Implement historic behavior, risking false positives.
157 eST_Default, ///< Be relatively strict, but still allow for typos.
158 eST_Strict ///< Require 100% encodability of printable non-digits.
159 };
160
161 enum EOnError {
162 eDefault = 0, ///< Return eUnknown
163 eThrowOnBadSource, ///< Throw an exception if the data source (stream, file) can't be read
164 };
165
166 static bool IsSupportedFormat(EFormat format);
167
168 /// Hints for guessing formats. Two hint types can be used: preferred and
169 /// disabled. Preferred are checked before any other formats. Disabled
170 /// formats are not checked at all.
171 class CFormatHints
172 {
173 public:
174 typedef CFormatGuess::EFormat TFormat;
175
CFormatHints(void)176 CFormatHints(void) {}
177
178 /// Mark the format as preferred.
179 CFormatHints& AddPreferredFormat(TFormat fmt);
180 /// Mark the format as disabled.
181 CFormatHints& AddDisabledFormat(TFormat fmt);
182 /// Disable all formats not marked as preferred
183 CFormatHints& DisableAllNonpreferred(void);
184 /// Remove format hint.
185 void RemoveFormat(TFormat fmt);
186 /// Remove all hints
187 CFormatHints& Reset(void);
188
189 /// Check if there are any hints are set at all.
190 bool IsEmpty(void) const;
191 /// Check if the format is listed as preferred.
192 bool IsPreferred(TFormat fmt) const;
193 /// Check if the format is listed as disabled.
194 bool IsDisabled(TFormat fmt) const;
195
196 private:
197 typedef bitset<CFormatGuess::eFormat_max> THints;
198
199 THints m_Preferred;
200 THints m_Disabled;
201 };
202
203 /// Guess sequence type. Function calculates sequence alphabet and
204 /// identifies if the source belongs to nucleotide or protein sequence
205 static ESequenceType SequenceType(const char* str, unsigned length = 0,
206 ESTStrictness strictness = eST_Default);
207
208 static const char* GetFormatName(EFormat format);
209
210 // ----------------------------------------------------------------------
211 // "Stateless" interface:
212 // Useful for checking for all formats in one simple call.
213 // May go away; use object interface instead.
214 // ----------------------------------------------------------------------
215
216 /// Guess file format
217 static
218 EFormat Format(const string& path, EOnError onerror = eDefault);
219
220 /// Format prediction based on an input stream
221 /// @note On completion, the function pushes whatever data it had to read
222 /// (in order to detect data format) back to the stream -- using
223 /// CStreamUtils::Stepback()
224 static
225 EFormat Format(CNcbiIstream& input, EOnError onerror = eDefault);
226
227
228 // ----------------------------------------------------------------------
229 // "Object" interface:
230 // Use when interested only in a limited number of formats, in excluding
231 // certain tests, a specific order in which formats are tested, ...
232 // ----------------------------------------------------------------------
233
234 CFormatGuess();
235
236 CFormatGuess(const string& fname);
237
238 /// @note Data format detection methods GuessFormat() and TestFormat()
239 /// take care to push whatever data they read back to the stream
240 /// using CStreamUtils::Stepback()
241 CFormatGuess(CNcbiIstream& input);
242
243 ~CFormatGuess();
244
245
246 NCBI_DEPRECATED EFormat GuessFormat(EMode);
247 NCBI_DEPRECATED bool TestFormat(EFormat, EMode);
248
249 /// @note If the instance of the class is built upon std::istream, then
250 /// on completion this function pushes whatever data it had to read
251 /// (in order to detect data format) back to the stream -- using
252 /// CStreamUtils::Stepback()
253 EFormat GuessFormat(EOnError onerror = eDefault);
254
255
256 /// @note If the instance of the class is built upon std::istream, then
257 /// on completion this function pushes whatever data it had to read
258 /// (in order to detect data format) back to the stream -- using
259 /// CStreamUtils::Stepback()
260 bool TestFormat(EFormat, EOnError onerror = eDefault);
261
262 /// Get format hints
GetFormatHints(void)263 CFormatHints& GetFormatHints(void) { return m_Hints; }
264
265 /// Check whether testing is enabled for given format
IsEnabled(EFormat format) const266 bool IsEnabled(EFormat format) const { return !m_Hints.IsDisabled(format); };
267
268 protected:
269 void Initialize();
270
271 bool EnsureTestBuffer();
272 bool EnsureStats();
273 bool EnsureSplitLines();
274 bool IsAllComment();
275 bool IsAsciiText();
276
277 bool TestFormatRepeatMasker(EMode);
278 bool TestFormatPhrapAce(EMode);
279 bool TestFormatGtf(EMode);
280 bool TestFormatGvf(EMode);
281 bool TestFormatGff3(EMode);
282 bool TestFormatGff2(EMode);
283 bool TestFormatGlimmer3(EMode);
284 bool TestFormatAgp(EMode);
285 bool TestFormatNewick(EMode);
286 bool TestFormatXml(EMode);
287 bool TestFormatAlignment(EMode);
288 bool TestFormatCLUSTAL(void);
289 bool TestFormatBinaryAsn(EMode);
290 bool TestFormatDistanceMatrix(EMode);
291 bool TestFormatTaxplot(EMode);
292 bool TestFormatFlatFileSequence(EMode);
293 bool TestFormatFiveColFeatureTable(EMode);
294 bool TestFormatTable(EMode);
295 bool TestFormatFasta(EMode);
296 bool TestFormatTextAsn(EMode);
297 bool TestFormatSnpMarkers(EMode);
298 bool TestFormatBed(EMode);
299 bool TestFormatBed15(EMode);
300 bool TestFormatWiggle(EMode);
301 bool TestFormatHgvs(EMode);
302 bool TestFormatZip(EMode);
303 bool TestFormatGZip(EMode);
304 bool TestFormatBZip2(EMode);
305 bool TestFormatLzo(EMode);
306 bool TestFormatSra(EMode);
307 bool TestFormatBam(EMode);
308 bool TestFormatVcf(EMode);
309 bool TestFormatAugustus(EMode);
310 bool TestFormatJson(EMode);
311 bool TestFormatPsl(EMode);
312
313 bool TestFormatFlatFileGenbank(EMode);
314 bool TestFormatFlatFileEna(EMode);
315 bool TestFormatFlatFileUniProt(EMode);
316
317 bool IsInputRepeatMaskerWithoutHeader();
318 bool IsInputRepeatMaskerWithHeader();
319
320 static bool IsLineFlatFileSequence(const std::string&);
321 static bool IsSampleNewick(const std::string&);
322 static bool IsLabelNewick(const std::string&);
323 static bool IsLineAgp(const std::string&);
324 static bool IsLineGlimmer3(const std::string&);
325 static bool IsLineGtf(const std::string&);
326 static bool IsLineGvf(const std::string&);
327 static bool IsLineGff3(const std::string&);
328 static bool IsLineGff2(const std::string&);
329 static bool IsLineAugustus(const std::string&);
330 static bool IsLinePhrapId(const std::string&);
331 static bool IsLineRmo(const std::string&);
332 static bool IsAsnComment(const vector<string>&);
333 static bool IsLineHgvs(const std::string&);
334 static bool IsLinePsl(const std::string&, bool ignoreFirstColumn);
335
336 private:
337 static bool x_TestInput( CNcbiIstream& input, EOnError onerror );
338
339 bool x_TestFormat(EFormat format, EMode mode);
340
341 // to test for a table we check each of the most common delimiter combitions,
342 // ' ' ' \t' '\t' ',' '|'
343 bool x_TestTableDelimiter(const string& delims);
344
345 // Check that the beginning of testString looks like JSON
346 bool x_CheckJsonStart(const string& testString) const;
347
348 // In-place deletion of JSON strings
349 void x_StripJsonStrings(string& testString) const;
350
351 // Starting at from_pos, find the next set of double quotes
352 // indicating the end of a JSON string
353 size_t x_FindNextJsonStringStop(const string& input, const size_t from_pos) const;
354
355 void x_FindJsonStringLimits(const string& testString, list<size_t>& limits) const;
356
357 // Checks and removes punctuation from testString
358 bool x_CheckStripJsonPunctuation(string& testString) const;
359
360 // In-place deletion of JSON punctuation
361 // Returns the number of characters deleted.
362 size_t x_StripJsonPunctuation(string& testString) const;
363
364 // In-place deletion of JSON keywords: true, false, null
365 void x_StripJsonKeywords(string& testString) const;
366
367 bool x_CheckStripJsonNumbers(string& testString) const;
368
369 bool x_IsTruncatedJsonNumber(const string& testString) const;
370
371 // Is a truncation of true, false, or null
372 bool x_IsTruncatedJsonKeyword(const string& testString) const;
373
374 bool x_IsNumber(const string& testString) const;
375
376 // Return true if the string is blank or a list of space-delimited numbers
377 bool x_IsBlankOrNumbers(const string& testString) const;
378
379 // data:
380 using NAME_MAP = map<EFormat, const char*>;
381 static const NAME_MAP sm_FormatNames;
382
383 bool x_TryProcessCLUSTALSeqData(const string& line, string& id, size_t& seg_length) const;
384
385 bool x_LooksLikeCLUSTALConservedInfo(const string& line) const;
386
387 protected:
388 static vector<int> sm_CheckOrder;
389
390 static const streamsize s_iTestBufferGranularity = 8096;
391
392
393 CNcbiIstream& m_Stream;
394 bool m_bOwnsStream;
395 char* m_pTestBuffer;
396 streamsize m_iTestBufferSize;
397 streamsize m_iTestDataSize;
398
399 bool m_bStatsAreValid;
400 bool m_bSplitDone;
401 unsigned int m_iStatsCountData;
402 unsigned int m_iStatsCountAlNumChars;
403 unsigned int m_iStatsCountDnaChars;
404 unsigned int m_iStatsCountAaChars;
405 unsigned int m_iStatsCountBraces;
406 std::list<std::string> m_TestLines;
407 CFormatHints m_Hints;
408 };
409
410
411 inline CFormatGuess::CFormatHints&
AddPreferredFormat(TFormat fmt)412 CFormatGuess::CFormatHints::AddPreferredFormat(TFormat fmt)
413 {
414 m_Disabled.reset(fmt);
415 m_Preferred.set(fmt);
416 return *this;
417 }
418
419
420 inline CFormatGuess::CFormatHints&
AddDisabledFormat(TFormat fmt)421 CFormatGuess::CFormatHints::AddDisabledFormat(TFormat fmt)
422 {
423 m_Preferred.reset(fmt);
424 m_Disabled.set(fmt);
425 return *this;
426 }
427
428 inline CFormatGuess::CFormatHints&
DisableAllNonpreferred(void)429 CFormatGuess::CFormatHints::DisableAllNonpreferred(void)
430 {
431 m_Disabled = ~m_Preferred;
432 return *this;
433 }
434
RemoveFormat(TFormat fmt)435 inline void CFormatGuess::CFormatHints::RemoveFormat(TFormat fmt)
436 {
437 m_Disabled.reset(fmt);
438 m_Preferred.reset(fmt);
439 }
440
441 inline CFormatGuess::CFormatHints&
Reset(void)442 CFormatGuess::CFormatHints::Reset(void)
443 {
444 m_Preferred.reset();
445 m_Disabled.reset();
446 return *this;
447 }
448
IsEmpty(void) const449 inline bool CFormatGuess::CFormatHints::IsEmpty(void) const
450 {
451 return m_Preferred.count() == 0 && m_Disabled.count() == 0;
452 }
453
IsPreferred(TFormat fmt) const454 inline bool CFormatGuess::CFormatHints::IsPreferred(TFormat fmt) const
455 {
456 return m_Preferred.test(fmt);
457 }
458
IsDisabled(TFormat fmt) const459 inline bool CFormatGuess::CFormatHints::IsDisabled(TFormat fmt) const
460 {
461 return m_Disabled.test(fmt);
462 }
463
464 END_NCBI_SCOPE
465
466 #endif
467