1 #ifndef FORMATGUESS__HPP
2 #define FORMATGUESS__HPP
3 
4 /*  $Id: format_guess.hpp 629211 2021-04-12 18:52:01Z ivanov $
5  * ===========================================================================
6  *
7  *                            PUBLIC DOMAIN NOTICE
8  *               National Center for Biotechnology Information
9  *
10  *  This software/database is a "United States Government Work" under the
11  *  terms of the United States Copyright Act.  It was written as part of
12  *  the author's official duties as a United States Government employee and
13  *  thus cannot be copyrighted.  This software/database is freely available
14  *  to the public for use. The National Library of Medicine and the U.S.
15  *  Government have not placed any restriction on its use or reproduction.
16  *
17  *  Although all reasonable efforts have been taken to ensure the accuracy
18  *  and reliability of the software and data, the NLM and the U.S.
19  *  Government do not and cannot warrant the performance or results that
20  *  may be obtained by using this software or data. The NLM and the U.S.
21  *  Government disclaim all warranties, express or implied, including
22  *  warranties of performance, merchantability or fitness for any particular
23  *  purpose.
24  *
25  *  Please cite the author in any work or product based on this material.
26  *
27  * ===========================================================================
28  *
29  * Author: Anatoliy Kuznetsov
30  *
31  * File Description:  Different "fuzzy-logic" methods to identify file formats.
32  *
33  */
34 
35 #include <corelib/ncbistd.hpp>
36 #include <bitset>
37 
38 BEGIN_NCBI_SCOPE
39 
40 class CFormatGuessHints;
41 
42 
43 //////////////////////////////////////////////////////////////////
44 ///
/// Class implements a set of ad-hoc heuristics for file format
/// identification; the results are best guesses and may be unreliable.
47 ///
48 
49 class NCBI_XUTIL_EXPORT CFormatGuess
50 {
51 public:
52     /// The formats are checked in the same order as declared here.
53     enum EFormat {
54         // WARNING! Never change numeric values of these enumerators!
55         // E.g. these values are hard-coded in the Local Data Storage (LDS)
56         // index databases.
57         eUnknown             =  0, ///< unknown format
58         eBinaryASN           =  1, ///< Binary ASN.1
59         eRmo                 =  2, ///< RepeatMasker Output
60         eGtf_POISENED        =  3, ///< Old and Dead GFF/GTF style annotations
61         eGlimmer3            =  4, ///< Glimmer3 predictions
62         eAgp                 =  5, ///< AGP format assembly, AgpRead
63         eXml                 =  6, ///< XML
64         eWiggle              =  7, ///< UCSC WIGGLE file format
65         eBed                 =  8, ///< UCSC BED file format, CBedReader
66         eBed15               =  9, ///< UCSC BED15 or microarray format
67         eNewick              = 10, ///< Newick file
68         eAlignment           = 11, ///< Text alignment
69         eDistanceMatrix      = 12, ///< Distance matrix file
70         eFlatFileSequence    = 13, ///< GenBank/GenPept/DDBJ/EMBL flat-file
71                                    ///< sequence portion
72         eFiveColFeatureTable = 14, ///< Five-column feature table
73         eSnpMarkers          = 15, ///< SNP Marker flat file
74         eFasta               = 16, ///< FASTA format sequence record, CFastaReader
75         eTextASN             = 17, ///< Text ASN.1
76         eTaxplot             = 18, ///< Taxplot file
77         ePhrapAce            = 19, ///< Phrap ACE assembly file
78         eTable               = 20, ///< Generic table
79         eGtf                 = 21, ///< New GTF, CGtfReader
80         eGff3                = 22, ///< GFF3, CGff3Reader
81         eGff2                = 23, ///< GFF2, CGff2Reader, any GFF-like that doesn't fit the others
82         eHgvs                = 24, ///< HGVS, CHgvsParser
83         eGvf                 = 25, ///< GVF, CGvfReader
84         eZip                 = 26, ///< zip compressed file
85         eGZip                = 27, ///< GNU zip compressed file
86         eBZip2               = 28, ///< bzip2 compressed file
87         eLzo                 = 29, ///< lzo compressed file
88         eSra                 = 30, ///< INSDC Sequence Read Archive file
89         eBam                 = 31, ///< Binary alignment/map file
90         eVcf                 = 32, ///< VCF, CVcfReader
91         eUCSCRegion          = 33, ///< USCS Region file format
92         eGffAugustus         = 34, ///< GFFish output of Augustus Gene Prediction
93         eJSON                = 35, ///< JSON
94         ePsl                 = 36, ///< PSL alignment format
95         // The following formats are not yet recognized by CFormatGuess - CXX-10039
96         eAltGraphX           = 37,
97         eBed5FloatScore      = 38,
98         eBedGraph            = 39,
99         eBedRnaElements      = 40,
100         eBigBarChart         = 41,
101         eBigBed              = 42,
102         eBigPsl              = 43,
103         eBigChain            = 44,
104         eBigMaf              = 45,
105         eBigWig              = 46,
106         eBroadPeak           = 47,
107         eChain               = 48,
108         eClonePos            = 49,
109         eColoredExon         = 50,
110         eCtgPos              = 51,
111         eDownloadsOnly       = 52,
112         eEncodeFiveC         = 53,
113         eExpRatio            = 54,
114         eFactorSource        = 55,
115         eGenePred            = 56,
116         eLd2                 = 57,
117         eNarrowPeak          = 58,
118         eNetAlign            = 59,
119         ePeptideMapping      = 60,
120         eRmsk                = 61,
121         eSnake               = 62,
122         eVcfTabix            = 63,
123         eWigMaf              = 64,
124 
125         // The following formats *are* recognized by CFormatGuess:
126         eFlatFileGenbank     = 65,
127         eFlatFileEna         = 66,
128         eFlatFileUniProt     = 67,
129 
130         // ***  Adding new format codes?  ***
131         //  (1) A sanity check in the  implementation depends on the format codes being
132         //      consecutive. Hence no gaps allowed!
133         //  (2) Heed the warning above about never changing an already existing
134         //      format code!
135         //  (3) You must provide a display name for the new format. Do that in
136         //      sm_FormatNames.
137         //  (4) You must add your new format to sm_CheckOrder (unless you don't want your
138         //      format actually being checked and recognized.
139 
140         /// Max value of EFormat
141         eFormat_max
142     };
143 
144     enum ESequenceType {
145         eUndefined,
146         eNucleotide,
147         eProtein
148     };
149 
150     enum EMode {
151         eQuick,
152         eThorough
153     };
154 
155     enum ESTStrictness {
156         eST_Lax,     ///< Implement historic behavior, risking false positives.
157         eST_Default, ///< Be relatively strict, but still allow for typos.
158         eST_Strict   ///< Require 100% encodability of printable non-digits.
159     };
160 
161     enum EOnError {
162         eDefault = 0,      ///< Return eUnknown
163         eThrowOnBadSource, ///< Throw an exception if the data source (stream, file) can't be read
164     };
165 
166     static bool IsSupportedFormat(EFormat format);
167 
168     /// Hints for guessing formats. Two hint types can be used: preferred and
169     /// disabled. Preferred are checked before any other formats. Disabled
170     /// formats are not checked at all.
171     class CFormatHints
172     {
173     public:
174         typedef CFormatGuess::EFormat TFormat;
175 
CFormatHints(void)176         CFormatHints(void) {}
177 
178         /// Mark the format as preferred.
179         CFormatHints& AddPreferredFormat(TFormat fmt);
180         /// Mark the format as disabled.
181         CFormatHints& AddDisabledFormat(TFormat fmt);
182         /// Disable all formats not marked as preferred
183         CFormatHints& DisableAllNonpreferred(void);
184         /// Remove format hint.
185         void RemoveFormat(TFormat fmt);
186         /// Remove all hints
187         CFormatHints& Reset(void);
188 
189         /// Check if there are any hints are set at all.
190         bool IsEmpty(void) const;
191         /// Check if the format is listed as preferred.
192         bool IsPreferred(TFormat fmt) const;
193         /// Check if the format is listed as disabled.
194         bool IsDisabled(TFormat fmt) const;
195 
196     private:
197         typedef bitset<CFormatGuess::eFormat_max> THints;
198 
199         THints m_Preferred;
200         THints m_Disabled;
201     };
202 
203     /// Guess sequence type. Function calculates sequence alphabet and
204     /// identifies if the source belongs to nucleotide or protein sequence
205     static ESequenceType SequenceType(const char* str, unsigned length = 0,
206                                       ESTStrictness strictness = eST_Default);
207 
208     static const char* GetFormatName(EFormat format);
209 
210     //  ----------------------------------------------------------------------
211     //  "Stateless" interface:
212     //  Useful for checking for all formats in one simple call.
213     //  May go away; use object interface instead.
214     //  ----------------------------------------------------------------------
215 
216     /// Guess file format
217     static
218     EFormat Format(const string& path, EOnError onerror = eDefault);
219 
220     /// Format prediction based on an input stream
221     /// @note On completion, the function pushes whatever data it had to read
222     ///       (in order to detect data format) back to the stream -- using
223     ///       CStreamUtils::Stepback()
224     static
225     EFormat Format(CNcbiIstream& input, EOnError onerror = eDefault);
226 
227 
228     //  ----------------------------------------------------------------------
229     //  "Object" interface:
230     //  Use when interested only in a limited number of formats, in excluding
231     //  certain tests, a specific order in which formats are tested, ...
232     //  ----------------------------------------------------------------------
233 
234     CFormatGuess();
235 
236     CFormatGuess(const string& fname);
237 
238     /// @note Data format detection methods GuessFormat() and TestFormat()
239     ///       take care to push whatever data they read back to the stream
240     ///       using CStreamUtils::Stepback()
241     CFormatGuess(CNcbiIstream& input);
242 
243     ~CFormatGuess();
244 
245 
246     NCBI_DEPRECATED EFormat GuessFormat(EMode);
247     NCBI_DEPRECATED bool TestFormat(EFormat, EMode);
248 
249     /// @note If the instance of the class is built upon std::istream, then
250     ///       on completion this function pushes whatever data it had to read
251     ///       (in order to detect data format) back to the stream -- using
252     ///       CStreamUtils::Stepback()
253     EFormat GuessFormat(EOnError onerror = eDefault);
254 
255 
256     /// @note If the instance of the class is built upon std::istream, then
257     ///       on completion this function pushes whatever data it had to read
258     ///       (in order to detect data format) back to the stream -- using
259     ///       CStreamUtils::Stepback()
260     bool TestFormat(EFormat, EOnError onerror = eDefault);
261 
262     /// Get format hints
GetFormatHints(void)263     CFormatHints& GetFormatHints(void) { return m_Hints; }
264 
265     /// Check whether testing is enabled for given format
IsEnabled(EFormat format) const266     bool IsEnabled(EFormat format) const { return !m_Hints.IsDisabled(format); };
267 
268 protected:
269     void Initialize();
270 
271     bool EnsureTestBuffer();
272     bool EnsureStats();
273     bool EnsureSplitLines();
274     bool IsAllComment();
275     bool IsAsciiText();
276 
277     bool TestFormatRepeatMasker(EMode);
278     bool TestFormatPhrapAce(EMode);
279     bool TestFormatGtf(EMode);
280     bool TestFormatGvf(EMode);
281     bool TestFormatGff3(EMode);
282     bool TestFormatGff2(EMode);
283     bool TestFormatGlimmer3(EMode);
284     bool TestFormatAgp(EMode);
285     bool TestFormatNewick(EMode);
286     bool TestFormatXml(EMode);
287     bool TestFormatAlignment(EMode);
288     bool TestFormatCLUSTAL(void);
289     bool TestFormatBinaryAsn(EMode);
290     bool TestFormatDistanceMatrix(EMode);
291     bool TestFormatTaxplot(EMode);
292     bool TestFormatFlatFileSequence(EMode);
293     bool TestFormatFiveColFeatureTable(EMode);
294     bool TestFormatTable(EMode);
295     bool TestFormatFasta(EMode);
296     bool TestFormatTextAsn(EMode);
297     bool TestFormatSnpMarkers(EMode);
298     bool TestFormatBed(EMode);
299     bool TestFormatBed15(EMode);
300     bool TestFormatWiggle(EMode);
301     bool TestFormatHgvs(EMode);
302     bool TestFormatZip(EMode);
303     bool TestFormatGZip(EMode);
304     bool TestFormatBZip2(EMode);
305     bool TestFormatLzo(EMode);
306     bool TestFormatSra(EMode);
307     bool TestFormatBam(EMode);
308     bool TestFormatVcf(EMode);
309     bool TestFormatAugustus(EMode);
310     bool TestFormatJson(EMode);
311     bool TestFormatPsl(EMode);
312 
313     bool TestFormatFlatFileGenbank(EMode);
314     bool TestFormatFlatFileEna(EMode);
315     bool TestFormatFlatFileUniProt(EMode);
316 
317     bool IsInputRepeatMaskerWithoutHeader();
318     bool IsInputRepeatMaskerWithHeader();
319 
320     static bool IsLineFlatFileSequence(const std::string&);
321     static bool IsSampleNewick(const std::string&);
322     static bool IsLabelNewick(const std::string&);
323     static bool IsLineAgp(const std::string&);
324     static bool IsLineGlimmer3(const std::string&);
325     static bool IsLineGtf(const std::string&);
326     static bool IsLineGvf(const std::string&);
327     static bool IsLineGff3(const std::string&);
328     static bool IsLineGff2(const std::string&);
329     static bool IsLineAugustus(const std::string&);
330     static bool IsLinePhrapId(const std::string&);
331     static bool IsLineRmo(const std::string&);
332     static bool IsAsnComment(const vector<string>&);
333     static bool IsLineHgvs(const std::string&);
334     static bool IsLinePsl(const std::string&, bool ignoreFirstColumn);
335 
336 private:
337     static bool x_TestInput( CNcbiIstream& input, EOnError onerror );
338 
339     bool x_TestFormat(EFormat format, EMode mode);
340 
341     // to test for a table we check each of the most common delimiter combitions,
342     // ' ' ' \t' '\t' ',' '|'
343     bool x_TestTableDelimiter(const string& delims);
344 
345     // Check that the beginning of testString looks like JSON
346     bool x_CheckJsonStart(const string& testString) const;
347 
348     // In-place deletion of JSON strings
349     void x_StripJsonStrings(string& testString) const;
350 
351     // Starting at from_pos, find the next set of double quotes
352     // indicating the end of a JSON string
353     size_t x_FindNextJsonStringStop(const string& input, const size_t from_pos) const;
354 
355     void x_FindJsonStringLimits(const string& testString, list<size_t>& limits) const;
356 
357     // Checks and removes punctuation from testString
358     bool x_CheckStripJsonPunctuation(string& testString) const;
359 
360     // In-place deletion of JSON punctuation
361     // Returns the number of characters deleted.
362     size_t x_StripJsonPunctuation(string& testString) const;
363 
364     // In-place deletion of JSON keywords: true, false, null
365     void x_StripJsonKeywords(string& testString) const;
366 
367     bool x_CheckStripJsonNumbers(string& testString) const;
368 
369     bool x_IsTruncatedJsonNumber(const string& testString) const;
370 
371     // Is a truncation of true, false, or null
372     bool x_IsTruncatedJsonKeyword(const string& testString) const;
373 
374     bool x_IsNumber(const string& testString) const;
375 
376     // Return true if the string is blank or a list of space-delimited numbers
377     bool x_IsBlankOrNumbers(const string& testString) const;
378 
379     // data:
380     using NAME_MAP = map<EFormat, const char*>;
381     static const NAME_MAP sm_FormatNames;
382 
383     bool x_TryProcessCLUSTALSeqData(const string& line, string& id, size_t& seg_length) const;
384 
385     bool x_LooksLikeCLUSTALConservedInfo(const string& line) const;
386 
387 protected:
388     static vector<int> sm_CheckOrder;
389 
390     static const streamsize s_iTestBufferGranularity = 8096;
391 
392 
393     CNcbiIstream& m_Stream;
394     bool m_bOwnsStream;
395     char* m_pTestBuffer;
396     streamsize m_iTestBufferSize;
397     streamsize m_iTestDataSize;
398 
399     bool m_bStatsAreValid;
400     bool m_bSplitDone;
401     unsigned int m_iStatsCountData;
402     unsigned int m_iStatsCountAlNumChars;
403     unsigned int m_iStatsCountDnaChars;
404     unsigned int m_iStatsCountAaChars;
405     unsigned int m_iStatsCountBraces;
406     std::list<std::string> m_TestLines;
407     CFormatHints m_Hints;
408 };
409 
410 
411 inline CFormatGuess::CFormatHints&
AddPreferredFormat(TFormat fmt)412 CFormatGuess::CFormatHints::AddPreferredFormat(TFormat fmt)
413 {
414     m_Disabled.reset(fmt);
415     m_Preferred.set(fmt);
416     return *this;
417 }
418 
419 
420 inline CFormatGuess::CFormatHints&
AddDisabledFormat(TFormat fmt)421 CFormatGuess::CFormatHints::AddDisabledFormat(TFormat fmt)
422 {
423     m_Preferred.reset(fmt);
424     m_Disabled.set(fmt);
425     return *this;
426 }
427 
428 inline CFormatGuess::CFormatHints&
DisableAllNonpreferred(void)429 CFormatGuess::CFormatHints::DisableAllNonpreferred(void)
430 {
431     m_Disabled = ~m_Preferred;
432     return *this;
433 }
434 
RemoveFormat(TFormat fmt)435 inline void CFormatGuess::CFormatHints::RemoveFormat(TFormat fmt)
436 {
437     m_Disabled.reset(fmt);
438     m_Preferred.reset(fmt);
439 }
440 
441 inline CFormatGuess::CFormatHints&
Reset(void)442 CFormatGuess::CFormatHints::Reset(void)
443 {
444     m_Preferred.reset();
445     m_Disabled.reset();
446     return *this;
447 }
448 
IsEmpty(void) const449 inline bool CFormatGuess::CFormatHints::IsEmpty(void) const
450 {
451     return m_Preferred.count() == 0  &&  m_Disabled.count() == 0;
452 }
453 
IsPreferred(TFormat fmt) const454 inline bool CFormatGuess::CFormatHints::IsPreferred(TFormat fmt) const
455 {
456     return m_Preferred.test(fmt);
457 }
458 
IsDisabled(TFormat fmt) const459 inline bool CFormatGuess::CFormatHints::IsDisabled(TFormat fmt) const
460 {
461     return m_Disabled.test(fmt);
462 }
463 
464 END_NCBI_SCOPE
465 
466 #endif
467