1 /*  $Id: formatguess.cpp 629210 2021-04-12 18:51:59Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Frank Ludwig, NCBI
27 *
28 * File Description:
29 *   Test application for the CFormatGuess component
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbienv.hpp>
37 #include <corelib/ncbiargs.hpp>
38 
39 #include <serial/objistrasnb.hpp>
40 #include <serial/objistrasn.hpp>
41 #include <serial/objistrxml.hpp>
42 #include <serial/objistrjson.hpp>
43 
44 #include <objects/seqset/Seq_entry.hpp>
45 #include <objects/submit/Seq_submit.hpp>
46 #include <objects/seqset/Bioseq_set.hpp>
47 #include <objects/seqalign/Seq_align.hpp>
48 #include <objects/seq/Seq_annot.hpp>
49 
50 #include <objects/seq/Bioseq.hpp>
51 
52 #include <misc/xmlwrapp/attributes.hpp>
53 #include <misc/xmlwrapp/document.hpp>
54 #include <misc/xmlwrapp/node.hpp>
55 
56 #include <objtools/readers/format_guess_ex.hpp>
57 
58 typedef std::map<ncbi::CFormatGuess::EFormat, std::string> FormatMap;
59 typedef FormatMap::iterator FormatIter;
60 
61 USING_NCBI_SCOPE;
62 USING_SCOPE(objects);
63 
64 set<TTypeInfo> sDefaultRecognizedGenbankObjectTypes = {
65     CType<CBioseq>().GetTypeInfo(),
66     CType<CBioseq_set>().GetTypeInfo(),
67     CType<CSeq_align>().GetTypeInfo(),
68     CType<CSeq_annot>().GetTypeInfo(),
69     CType<CSeq_entry>().GetTypeInfo(),
70     CType<CSeq_submit>().GetTypeInfo(),
71 };
72 
73 
74 
75 //  ============================================================================
76 class CFormatGuessApp
77 //  ============================================================================
78      : public CNcbiApplication
79 {
80 private:
81     static string guess_object_type(CObjectIStream & obj_istrm);
82 
83     virtual void Init(void);
84     virtual int  Run(void);
85     virtual void Exit(void);
86 };
87 
88 /*
89 //  ============================================================================
90 string CFormatGuessApp::guess_object_type(CObjectIStream & obj_istrm)
91 //  ============================================================================
92 {
93     set<TTypeInfo> known_types = {
94         CType<CSeq_entry>().GetTypeInfo(),
95         CType<CSeq_submit>().GetTypeInfo(),
96         CType<CBioseq_set>().GetTypeInfo(),
97         CType<CBioseq>().GetTypeInfo()
98     };
99 
100     set<TTypeInfo> types = obj_istrm.GuessDataType(known_types);
101     if ( types.size() != 1 ) {
102         return "unknown";
103     }
104     return (*types.begin())->GetName();
105 }
106 */
107 
108 //  ============================================================================
Init(void)109 void CFormatGuessApp::Init(void)
110 //  ============================================================================
111 {
112     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
113 
114     arg_desc->SetUsageContext
115         (GetArguments().GetProgramBasename(),
116          "CFormatGuess front end: Guess various file formats");
117 
118     //
119     //  shared flags and parameters:
120     //
121     arg_desc->AddDefaultKey
122         ("i", "InputFile",
123          "Input File Name or '-' for stdin.",
124          CArgDescriptions::eInputFile,
125          "-");
126 
127     arg_desc->AddFlag(
128         "canonical-name",
129         "Use the canonical name which is the name of the format as "
130         "given by the underlying C++ format guesser.");
131 
132     arg_desc->AddFlag(
133         "show-object-type",
134         "Make output include the type of the object.   If it cannot be "
135         "determined or does not make sense for the given format then it "
136         "considers it 'unknown'"
137     );
138 
139     arg_desc->AddDefaultKey(
140         "output-format", "OutputFormat",
141         "How this program should send the results of its guesses.",
142         CArgDescriptions::eString,
143         "text"
144     );
145     arg_desc->SetConstraint("output-format", &(*new CArgAllow_Strings,
146                                                "text", "XML"));
147 
148 
149     SetupArgDescriptions(arg_desc.release());
150 }
151 
152 //  ============================================================================
153 int
Run(void)154 CFormatGuessApp::Run(void)
155 //  ============================================================================
156 {
157     const CArgs& args = GetArgs();
158     //CNcbiIstream & input_stream = args["i"].AsInputFile(CArgValue::fBinary);
159     string name_of_input_stream = args["i"].AsString();
160     if( name_of_input_stream.empty() || name_of_input_stream == "-" ) {
161         name_of_input_stream = "stdin";
162     }
163 
164     CFormatGuessEx guesser( name_of_input_stream );
165     CFileContentInfo contentInfo;
166     guesser.SetRecognizedGenbankTypes(sDefaultRecognizedGenbankObjectTypes);
167     CFormatGuess::EFormat uFormat = guesser.GuessFormatAndContent(contentInfo);
168 
169     string format_name;
170     if( args["canonical-name"] ) {
171         // caller wants to always use the format-guesser's name
172         format_name = CFormatGuess::GetFormatName(uFormat);
173     } else {
174         // caller wants special names for some types
175         FormatMap FormatStrings;
176         FormatStrings[ CFormatGuess::eUnknown ] = "Format not recognized";
177         FormatStrings[ CFormatGuess::eBinaryASN ] = "Binary ASN.1";
178         FormatStrings[ CFormatGuess::eTextASN ] = "Text ASN.1";
179         FormatStrings[ CFormatGuess::eFasta ] = "FASTA sequence record";
180         FormatStrings[ CFormatGuess::eXml ] = "XML";
181         FormatStrings[ CFormatGuess::eRmo ] = "RepeatMasker Out";
182         FormatStrings[ CFormatGuess::eGlimmer3 ] = "Glimmer3 prediction";
183         FormatStrings[ CFormatGuess::ePhrapAce ] = "Phrap ACE assembly file";
184         FormatStrings[ CFormatGuess::eGtf ] = "GFF/GTF style annotation";
185         FormatStrings[ CFormatGuess::eAgp ] = "AGP format assembly";
186         FormatStrings[ CFormatGuess::eNewick ] = "Newick tree";
187         FormatStrings[ CFormatGuess::eDistanceMatrix ] = "Distance matrix";
188         FormatStrings[ CFormatGuess::eFiveColFeatureTable ] =
189             "Five column feature table";
190         FormatStrings[ CFormatGuess::eTaxplot ] = "Tax plot";
191         FormatStrings[ CFormatGuess::eTable ] = "Generic table";
192         FormatStrings[ CFormatGuess::eAlignment ] = "Text alignment";
193         FormatStrings[ CFormatGuess::eFlatFileSequence ] =
194             "Flat file sequence portion";
195         FormatStrings[ CFormatGuess::eSnpMarkers ] = "SNP marker flat file";
196         FormatStrings[ CFormatGuess::eWiggle ] = "UCSC Wiggle file";
197         FormatStrings[ CFormatGuess::eBed ] = "UCSC BED file";
198         FormatStrings[ CFormatGuess::eBed15 ] = "UCSC microarray file";
199         FormatStrings[ CFormatGuess::eHgvs ] = "HGVS Variation file";
200         FormatStrings[ CFormatGuess::eGff2 ] = "GFF2 feature table";
201         FormatStrings[ CFormatGuess::eGff3 ] = "GFF3 feature table";
202         FormatStrings[ CFormatGuess::eGvf ] = "GVF gene variation data";
203         FormatStrings[ CFormatGuess::eVcf ] = "VCF Variant Call Format";
204 
205         FormatIter it = FormatStrings.find( uFormat );
206         if ( it == FormatStrings.end() ) {
207             // cout << "Unmapped format [" << uFormat << "]";
208             format_name = CFormatGuess::GetFormatName(uFormat);
209         }
210         else {
211             format_name = it->second;
212         }
213     }
214 
215     string object_type_to_show("unknown");
216     if( args["show-object-type"] ) {
217         switch(uFormat) {
218         default:
219             break;
220         case CFormatGuess::eTextASN:
221         case CFormatGuess::eBinaryASN:
222         case CFormatGuess::eJSON:
223         case CFormatGuess::eXml:
224             object_type_to_show = contentInfo.mInfoGenbank.mObjectType;
225             break;
226         }
227     }
228 
229     const string output_format = args["output-format"].AsString();
230 
231     if( output_format == "text" ) {
232         cout << name_of_input_stream << " :   ";
233 
234         _ASSERT( ! format_name.empty() ); // should be non-empty even if unknown
235         cout << format_name;
236 
237         // second line is object type line, if applicable.
238         if( ! object_type_to_show.empty() ) {
239             cout << ", object type:   " << object_type_to_show;
240         }
241 
242         cout << endl;
243     } else if( output_format == "XML" ) {
244 
245         xml::node output_node("formatguess");
246 
247         // input_stream_node for each input specified.
248         // However, there's currently only one so no loop here yet.
249         xml::node input_stream_node("input_stream");
250         xml::attributes & stream_attribs = input_stream_node.get_attributes();
251         stream_attribs.insert("name", name_of_input_stream.c_str());
252         stream_attribs.insert("format_name", format_name.c_str());
253          if( ! object_type_to_show.empty() ) {
254             stream_attribs.insert("object_type", object_type_to_show.c_str());
255          }
256 
257         output_node.push_back(input_stream_node);
258 
259         xml::document output_doc(output_node);
260         output_doc.save_to_stream(cout);
261     } else {
262         _TROUBLE;
263     }
264 
265     return 0;
266 }
267 
268 //  ============================================================================
Exit(void)269 void CFormatGuessApp::Exit(void)
270 //  ============================================================================
271 {
272     SetDiagStream(0);
273 }
274 
275 //  ============================================================================
main(int argc,const char * argv[])276 int main(int argc, const char* argv[])
277 //  ============================================================================
278 {
279     // Execute main application function
280     return CFormatGuessApp().AppMain(argc, argv, 0, eDS_Default, 0);
281 }
282 
283