1 /* $Id: formatguess.cpp 629210 2021-04-12 18:51:59Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Frank Ludwig, NCBI
27 *
28 * File Description:
29 * Test application for the CFormatGuess component
30 *
31 * ===========================================================================
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbienv.hpp>
37 #include <corelib/ncbiargs.hpp>
38
39 #include <serial/objistrasnb.hpp>
40 #include <serial/objistrasn.hpp>
41 #include <serial/objistrxml.hpp>
42 #include <serial/objistrjson.hpp>
43
44 #include <objects/seqset/Seq_entry.hpp>
45 #include <objects/submit/Seq_submit.hpp>
46 #include <objects/seqset/Bioseq_set.hpp>
47 #include <objects/seqalign/Seq_align.hpp>
48 #include <objects/seq/Seq_annot.hpp>
49
50 #include <objects/seq/Bioseq.hpp>
51
52 #include <misc/xmlwrapp/attributes.hpp>
53 #include <misc/xmlwrapp/document.hpp>
54 #include <misc/xmlwrapp/node.hpp>
55
56 #include <objtools/readers/format_guess_ex.hpp>
57
58 typedef std::map<ncbi::CFormatGuess::EFormat, std::string> FormatMap;
59 typedef FormatMap::iterator FormatIter;
60
61 USING_NCBI_SCOPE;
62 USING_SCOPE(objects);
63
64 set<TTypeInfo> sDefaultRecognizedGenbankObjectTypes = {
65 CType<CBioseq>().GetTypeInfo(),
66 CType<CBioseq_set>().GetTypeInfo(),
67 CType<CSeq_align>().GetTypeInfo(),
68 CType<CSeq_annot>().GetTypeInfo(),
69 CType<CSeq_entry>().GetTypeInfo(),
70 CType<CSeq_submit>().GetTypeInfo(),
71 };
72
73
74
75 // ============================================================================
76 class CFormatGuessApp
77 // ============================================================================
78 : public CNcbiApplication
79 {
80 private:
81 static string guess_object_type(CObjectIStream & obj_istrm);
82
83 virtual void Init(void);
84 virtual int Run(void);
85 virtual void Exit(void);
86 };
87
88 /*
89 // ============================================================================
90 string CFormatGuessApp::guess_object_type(CObjectIStream & obj_istrm)
91 // ============================================================================
92 {
93 set<TTypeInfo> known_types = {
94 CType<CSeq_entry>().GetTypeInfo(),
95 CType<CSeq_submit>().GetTypeInfo(),
96 CType<CBioseq_set>().GetTypeInfo(),
97 CType<CBioseq>().GetTypeInfo()
98 };
99
100 set<TTypeInfo> types = obj_istrm.GuessDataType(known_types);
101 if ( types.size() != 1 ) {
102 return "unknown";
103 }
104 return (*types.begin())->GetName();
105 }
106 */
107
108 // ============================================================================
Init(void)109 void CFormatGuessApp::Init(void)
110 // ============================================================================
111 {
112 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
113
114 arg_desc->SetUsageContext
115 (GetArguments().GetProgramBasename(),
116 "CFormatGuess front end: Guess various file formats");
117
118 //
119 // shared flags and parameters:
120 //
121 arg_desc->AddDefaultKey
122 ("i", "InputFile",
123 "Input File Name or '-' for stdin.",
124 CArgDescriptions::eInputFile,
125 "-");
126
127 arg_desc->AddFlag(
128 "canonical-name",
129 "Use the canonical name which is the name of the format as "
130 "given by the underlying C++ format guesser.");
131
132 arg_desc->AddFlag(
133 "show-object-type",
134 "Make output include the type of the object. If it cannot be "
135 "determined or does not make sense for the given format then it "
136 "considers it 'unknown'"
137 );
138
139 arg_desc->AddDefaultKey(
140 "output-format", "OutputFormat",
141 "How this program should send the results of its guesses.",
142 CArgDescriptions::eString,
143 "text"
144 );
145 arg_desc->SetConstraint("output-format", &(*new CArgAllow_Strings,
146 "text", "XML"));
147
148
149 SetupArgDescriptions(arg_desc.release());
150 }
151
152 // ============================================================================
153 int
Run(void)154 CFormatGuessApp::Run(void)
155 // ============================================================================
156 {
157 const CArgs& args = GetArgs();
158 //CNcbiIstream & input_stream = args["i"].AsInputFile(CArgValue::fBinary);
159 string name_of_input_stream = args["i"].AsString();
160 if( name_of_input_stream.empty() || name_of_input_stream == "-" ) {
161 name_of_input_stream = "stdin";
162 }
163
164 CFormatGuessEx guesser( name_of_input_stream );
165 CFileContentInfo contentInfo;
166 guesser.SetRecognizedGenbankTypes(sDefaultRecognizedGenbankObjectTypes);
167 CFormatGuess::EFormat uFormat = guesser.GuessFormatAndContent(contentInfo);
168
169 string format_name;
170 if( args["canonical-name"] ) {
171 // caller wants to always use the format-guesser's name
172 format_name = CFormatGuess::GetFormatName(uFormat);
173 } else {
174 // caller wants special names for some types
175 FormatMap FormatStrings;
176 FormatStrings[ CFormatGuess::eUnknown ] = "Format not recognized";
177 FormatStrings[ CFormatGuess::eBinaryASN ] = "Binary ASN.1";
178 FormatStrings[ CFormatGuess::eTextASN ] = "Text ASN.1";
179 FormatStrings[ CFormatGuess::eFasta ] = "FASTA sequence record";
180 FormatStrings[ CFormatGuess::eXml ] = "XML";
181 FormatStrings[ CFormatGuess::eRmo ] = "RepeatMasker Out";
182 FormatStrings[ CFormatGuess::eGlimmer3 ] = "Glimmer3 prediction";
183 FormatStrings[ CFormatGuess::ePhrapAce ] = "Phrap ACE assembly file";
184 FormatStrings[ CFormatGuess::eGtf ] = "GFF/GTF style annotation";
185 FormatStrings[ CFormatGuess::eAgp ] = "AGP format assembly";
186 FormatStrings[ CFormatGuess::eNewick ] = "Newick tree";
187 FormatStrings[ CFormatGuess::eDistanceMatrix ] = "Distance matrix";
188 FormatStrings[ CFormatGuess::eFiveColFeatureTable ] =
189 "Five column feature table";
190 FormatStrings[ CFormatGuess::eTaxplot ] = "Tax plot";
191 FormatStrings[ CFormatGuess::eTable ] = "Generic table";
192 FormatStrings[ CFormatGuess::eAlignment ] = "Text alignment";
193 FormatStrings[ CFormatGuess::eFlatFileSequence ] =
194 "Flat file sequence portion";
195 FormatStrings[ CFormatGuess::eSnpMarkers ] = "SNP marker flat file";
196 FormatStrings[ CFormatGuess::eWiggle ] = "UCSC Wiggle file";
197 FormatStrings[ CFormatGuess::eBed ] = "UCSC BED file";
198 FormatStrings[ CFormatGuess::eBed15 ] = "UCSC microarray file";
199 FormatStrings[ CFormatGuess::eHgvs ] = "HGVS Variation file";
200 FormatStrings[ CFormatGuess::eGff2 ] = "GFF2 feature table";
201 FormatStrings[ CFormatGuess::eGff3 ] = "GFF3 feature table";
202 FormatStrings[ CFormatGuess::eGvf ] = "GVF gene variation data";
203 FormatStrings[ CFormatGuess::eVcf ] = "VCF Variant Call Format";
204
205 FormatIter it = FormatStrings.find( uFormat );
206 if ( it == FormatStrings.end() ) {
207 // cout << "Unmapped format [" << uFormat << "]";
208 format_name = CFormatGuess::GetFormatName(uFormat);
209 }
210 else {
211 format_name = it->second;
212 }
213 }
214
215 string object_type_to_show("unknown");
216 if( args["show-object-type"] ) {
217 switch(uFormat) {
218 default:
219 break;
220 case CFormatGuess::eTextASN:
221 case CFormatGuess::eBinaryASN:
222 case CFormatGuess::eJSON:
223 case CFormatGuess::eXml:
224 object_type_to_show = contentInfo.mInfoGenbank.mObjectType;
225 break;
226 }
227 }
228
229 const string output_format = args["output-format"].AsString();
230
231 if( output_format == "text" ) {
232 cout << name_of_input_stream << " : ";
233
234 _ASSERT( ! format_name.empty() ); // should be non-empty even if unknown
235 cout << format_name;
236
237 // second line is object type line, if applicable.
238 if( ! object_type_to_show.empty() ) {
239 cout << ", object type: " << object_type_to_show;
240 }
241
242 cout << endl;
243 } else if( output_format == "XML" ) {
244
245 xml::node output_node("formatguess");
246
247 // input_stream_node for each input specified.
248 // However, there's currently only one so no loop here yet.
249 xml::node input_stream_node("input_stream");
250 xml::attributes & stream_attribs = input_stream_node.get_attributes();
251 stream_attribs.insert("name", name_of_input_stream.c_str());
252 stream_attribs.insert("format_name", format_name.c_str());
253 if( ! object_type_to_show.empty() ) {
254 stream_attribs.insert("object_type", object_type_to_show.c_str());
255 }
256
257 output_node.push_back(input_stream_node);
258
259 xml::document output_doc(output_node);
260 output_doc.save_to_stream(cout);
261 } else {
262 _TROUBLE;
263 }
264
265 return 0;
266 }
267
268 // ============================================================================
Exit(void)269 void CFormatGuessApp::Exit(void)
270 // ============================================================================
271 {
272 SetDiagStream(0);
273 }
274
275 // ============================================================================
main(int argc,const char * argv[])276 int main(int argc, const char* argv[])
277 // ============================================================================
278 {
279 // Execute main application function
280 return CFormatGuessApp().AppMain(argc, argv, 0, eDS_Default, 0);
281 }
282
283