1 /*  $Id: blastdbcmd.cpp 631510 2021-05-19 13:47:40Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blastdbcmd.cpp
31  * Command line tool to examine the contents of BLAST databases. This is the
32  * successor to fastacmd from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <algo/blast/api/version.hpp>
38 #include <objtools/blast/seqdb_reader/seqdbexpert.hpp>
39 #include <objtools/blast/seqdb_reader/impl/seqdbtax.hpp>
40 #include <algo/blast/api/blast_exception.hpp>
41 #include <algo/blast/blastinput/blast_input_aux.hpp>
42 #include <objtools/blast/blastdb_format/seq_formatter.hpp>
43 #include <objtools/blast/blastdb_format/blastdb_formatter.hpp>
44 #include <objtools/blast/blastdb_format/blastdb_seqid.hpp>
45 #include <algo/blast/blastinput/blast_input.hpp>
46 #include <objects/seqloc/PDB_seq_id.hpp>
47 #include "../blast/blast_app_util.hpp"
48 #include <iomanip>
49 
50 
51 #ifndef SKIP_DOXYGEN_PROCESSING
52 USING_NCBI_SCOPE;
53 USING_SCOPE(blast);
54 #endif
55 
56 static const string NA = "N/A";
57 
58 /// The application class
59 class CBlastDBCmdApp : public CNcbiApplication
60 {
61 public:
62     /** @inheritDoc */
CBlastDBCmdApp()63     CBlastDBCmdApp() {
64         CRef<CVersion> version(new CVersion());
65         version->SetVersionInfo(new CBlastVersion());
66         SetFullVersion(version);
67         m_StopWatch.Start();
68         if (m_UsageReport.IsEnabled()) {
69         	m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
70         	m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "blastdbcmd");
71         }
72     }
~CBlastDBCmdApp()73     ~CBlastDBCmdApp() {
74     	m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
75     }
76 private:
77     /** @inheritDoc */
78     virtual void Init();
79     /** @inheritDoc */
80     virtual int Run();
81 
82     /// Handle to BLAST database
83     CRef<CSeqDBExpert> m_BlastDb;
84     /// Is the database protein
85     bool m_DbIsProtein;
86     /// output is FASTA
87     bool m_FASTA;
88     /// output is ASN.1 defline
89     bool m_Asn1Bioseq;
90     /// should we find duplicate entries?
91     bool m_GetDuplicates;
92     /// should we output target sequence only?
93     bool m_TargetOnly;
94 
95     CBlastDB_FormatterConfig m_Config;
96 
97     set<TTaxId> m_TaxIdList;
98 
99     CBlastUsageReport m_UsageReport;
100     CStopWatch m_StopWatch;
101 
102     /// Initializes Blast DB
103     void x_InitBlastDB();
104     void x_InitBlastDB_TaxIdList();
105 
106     string x_InitSearchRequest();
107 
108     /// Prints the BLAST database information (e.g.: handles -info command line
109     /// option)
110     void x_PrintBlastDatabaseInformation();
111 
112     /// Processes all requests except printing the BLAST database information
113     /// @return 0 on success; 1 if some sequences were not retrieved
114     int x_ProcessSearchRequest();
115 
116     /// Process batch entry with range, strand and filter id
117     /// @param args program input args
118     /// @param seq_fmt sequence formatter object
119     /// @return 0 on sucess; 1 if some queries were not processed
120     int x_ProcessBatchEntry(CBlastDB_Formatter & seq_fmt);
121 
122     int x_ProcessBatchEntry_NoDup(CBlastDB_Formatter & fmt);
123 
124     /// Process entry with range, strand and filter id
125     /// @param args program input args
126     /// @param seq_fmt sequence formatter object
127     /// @return 0 on sucess; 1 if some queries were not processed
128     int x_ProcessEntry(CBlastDB_Formatter & fmt);
129 
130     int x_ProcessTaxIdList(CBlastDB_Formatter & fmt);
131 
132     int x_ProcessSearchType(CBlastDB_Formatter & fmt);
133 
134     bool x_GetOids(const string & acc, vector<int> & oids);
135 
136     int x_ModifyConfigForBatchEntry(const string & config);
137 
138     bool x_UseLongSeqIds();
139 
140     void x_PrintBlastDatabaseTaxInformation();
141 
142     int x_ProcessBatchPig(CBlastDB_Formatter & fmt);
143 
144     void x_AddCmdOptions();
145 };
146 
147 
/// Normalizes a user supplied identifier before it is looked up in a
/// version 5 BLAST database.
///
/// Identifiers containing '|' or '_' are parsed as a Seq-id; when parsing
/// succeeds the result depends on the Seq-id type:
///  - PIR / PRF: the FASTA style string
///  - PDB:       the plain Seq-id string
///  - others:    the Seq-id string requested with its version
/// Everything else (including identifiers that fail to parse) is returned
/// upper-cased.
///
/// @param id identifier as typed by the user
/// @return the identifier in the form used for the database lookup
string s_PreProcessAccessionsForDBv5(const string & id)
{
    if ((id.find('|') != NPOS) || (id.find('_') != NPOS)) {

        CRef<CSeq_id> seqid;
        try {
            seqid.Reset(new CSeq_id(id, CSeq_id::fParse_RawText |
                                        CSeq_id::fParse_AnyLocal |
                                        CSeq_id::fParse_PartialOK));
        }
        catch (...) {
            // Deliberate best effort: an identifier that cannot be parsed
            // is handled by the upper-casing fallback below.
        }

        if (seqid.NotEmpty()) {
            if (seqid->IsPir() || seqid->IsPrf()) {
                return seqid->AsFastaString();
            }
            if (seqid->IsPdb()) {
                // The string is used unchanged.  It used to be split at
                // position 4 and glued back together, which produced the
                // same text but threw std::out_of_range for a string shorter
                // than four characters.
                return seqid->GetSeqIdString();
            }
            return seqid->GetSeqIdString(true);
        }
    }

    string rv = id;
    return NStr::ToUpper(rv);
}
177 
178 
179 bool
x_GetOids(const string & id,vector<int> & oids)180 CBlastDBCmdApp::x_GetOids(const string & id, vector<int> & oids)
181 {
182 	string acc = id;
183 	if(m_BlastDb->GetBlastDbVersion() == EBlastDbVersion::eBDB_Version5) {
184 		acc = s_PreProcessAccessionsForDBv5(id);
185 	}
186 	TGi num_id = NStr::StringToNumeric<TGi>(acc, NStr::fConvErr_NoThrow);
187 	if(!errno) {
188 		int gi_oid = -1;
189 		m_BlastDb->GiToOidwFilterCheck(num_id, gi_oid);
190 		if(gi_oid < 0) {
191 			m_BlastDb->AccessionToOids(acc, oids);
192 		}
193 		else {
194 			oids.push_back(gi_oid);
195 		}
196 
197 	}
198 	else {
199 		m_BlastDb->AccessionToOids(acc, oids);
200 	}
201 	if(oids.empty()) {
202 		ERR_POST(Error <<  "Entry not found: " << acc);
203 		return false;
204 	}
205 	return true;
206 }
207 
208 int
x_ProcessEntry(CBlastDB_Formatter & fmt)209 CBlastDBCmdApp::x_ProcessEntry(CBlastDB_Formatter & fmt)
210 {
211 	unsigned int err_found = 0;
212     const CArgs& args = GetArgs();
213     _ASSERT(m_BlastDb.NotEmpty());
214 
215    	if (args["ipg"].HasValue()) {
216     	CSeqDB::TOID oid;
217     	m_BlastDb->PigToOid(args["ipg"].AsInteger(),oid);
218    		fmt.Write(oid, m_Config);
219     } else if (args["entry"].HasValue()) {
220     	static const string kDelim(",");
221     	const string& entry = args["entry"].AsString();
222 
223     	vector<string> queries;
224        	if (entry.find(kDelim[0]) != string::npos) {
225            	NStr::Split(entry, kDelim, queries);
226        	} else {
227        		queries.resize(1);
228        		queries[0]  = entry;
229      	}
230    		for(unsigned int i=0; i < queries.size(); i++) {
231      		vector<CSeqDB::TOID> oids;
232      		if(x_GetOids(queries[i], oids)) {
233    				for(unsigned int j=0; j < oids.size(); j++) {
234    					if(m_TargetOnly) {
235    						fmt.Write(oids[j], m_Config, queries[i]);
236    					}
237    					else {
238    						fmt.Write(oids[j], m_Config);
239    					}
240      			}
241      		}
242      		else {
243      			err_found ++;
244      		}
245      	}
246    		if(err_found == queries.size()) {
247    			NCBI_THROW(CInputException, eInvalidInput,
248    		               "Entry or entries not found in BLAST database");
249    		}
250     }
251    	return (err_found) ? 1:0;
252 }
253 
s_IsMaskAlgoIdValid(CSeqDB & blastdb,int id)254 bool s_IsMaskAlgoIdValid(CSeqDB & blastdb, int id)
255 {
256 	if (id >= 0) {
257 	    vector<int> algo_id(1, id);
258 	    vector<int> invalid_algo_ids = blastdb.ValidateMaskAlgorithms(algo_id);
259 	    if ( !invalid_algo_ids.empty()) {
260 	    	ERR_POST(Error << "Invalid filtering algorithm ID: " << NStr::IntToString(id));
261 	    	return false;
262 	    }
263 	}
264 	return true;
265 }
266 
x_ModifyConfigForBatchEntry(const string & format)267 int CBlastDBCmdApp::x_ModifyConfigForBatchEntry(const string & format)
268 {
269 	int status = 0;
270 	if (!m_DbIsProtein) {
271 		m_Config.m_Strand = eNa_strand_plus;
272 	}
273    	m_Config.m_SeqRange = TSeqRange::GetEmpty();
274    	m_Config.m_FiltAlgoId = -1;
275    	if(!format.empty()) {
276    		vector<string> tmp;
277    		NStr::Split(format, " \t", tmp, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
278    		for(unsigned int i=0; i < tmp.size(); i++) {
279    			if(tmp[i].find('-')!= string::npos) {
280    				try {
281    					m_Config.m_SeqRange = ParseSequenceRangeOpenEnd(tmp[i]);
282    				} catch (...) {
283    				}
284    			}
285    			else if (!m_DbIsProtein && NStr::EqualNocase(tmp[i].c_str(), "minus")) {
286    				m_Config.m_Strand = eNa_strand_minus;
287    			}
288    			else {
289    				m_Config.m_FiltAlgoId = NStr::StringToNonNegativeInt(tmp[i]);
290    				if(!s_IsMaskAlgoIdValid(*m_BlastDb, m_Config.m_FiltAlgoId)){
291    					status = 1;
292    				}
293    			}
294    		}
295    	}
296    	return status;
297 }
298 
299 int
x_ProcessTaxIdList(CBlastDB_Formatter & fmt)300 CBlastDBCmdApp::x_ProcessTaxIdList(CBlastDB_Formatter & fmt)
301 {
302     vector<blastdb::TOid> oids;
303     m_BlastDb->TaxIdsToOids(m_TaxIdList, oids);
304     if(oids.size() == 0) {
305 		ERR_POST (Error << "No seq found in db for taxonomy list");
306 		return 1;
307     }
308     for(unsigned i=0; i < oids.size(); i++) {
309     	fmt.Write(oids[i], m_Config);
310     }
311     return  0;
312 }
313 
314 
315 void
x_InitBlastDB_TaxIdList()316 CBlastDBCmdApp::x_InitBlastDB_TaxIdList()
317 {
318    	const CArgs& args = GetArgs();
319     vector<string> ids;
320     if(args[kArgTaxIdList].HasValue()) {
321     	string input = args[kArgTaxIdList].AsString();
322     	NStr::Split(input, ",", ids);
323     }
324     else {
325     	CNcbiIstream& input = args[kArgTaxIdListFile].AsInputFile();
326     	while (input) {
327     		string line;
328     	    NcbiGetlineEOL(input, line);
329     	    if ( !line.empty() ) {
330     	       	ids.push_back(line);
331     	    }
332     	}
333     }
334     for(unsigned int i=0; i < ids.size(); i++) {
335     	m_TaxIdList.insert(NStr::StringToNumeric<TTaxId>(ids[i], NStr::fAllowLeadingSpaces | NStr::fAllowTrailingSpaces));
336     }
337 
338     CSeqDB::ESeqType seqtype = ParseMoleculeTypeString(args[kArgDbType].AsString());
339     m_DbIsProtein = static_cast<bool>(seqtype == CSeqDB::eProtein);
340     m_TargetOnly = args["target_only"];
341    	if(m_TargetOnly) {
342     	CRef<CSeqDBGiList> taxid_list(new CSeqDBGiList());
343     	taxid_list->AddTaxIds(m_TaxIdList);
344    		m_BlastDb.Reset(new CSeqDBExpert(args[kArgDb].AsString(), seqtype, taxid_list.GetPointer()));
345     }
346    	else {
347    		m_BlastDb.Reset(new CSeqDBExpert(args[kArgDb].AsString(), seqtype));
348    	}
349 }
350 
351 
352 int
x_ProcessBatchEntry_NoDup(CBlastDB_Formatter & fmt)353 CBlastDBCmdApp::x_ProcessBatchEntry_NoDup(CBlastDB_Formatter & fmt)
354 {
355 	int err_found = 0;
356    	const CArgs& args = GetArgs();
357     CNcbiIstream& input = args["entry_batch"].AsInputFile();
358     vector<string> ids, formats;
359     vector<CSeqDB::TOID> oids;
360     while (input) {
361         string line;
362         NcbiGetlineEOL(input, line);
363         if ( !line.empty() ) {
364         	string id, format;
365         	NStr::SplitInTwo(line, " \t", id, format, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
366         	if(id.empty()) {
367         		continue;
368         	}
369         	ids.push_back(id);
370         	formats.push_back(format);
371         }
372     }
373 
374     if(m_BlastDb->GetBlastDbVersion() == EBlastDbVersion::eBDB_Version5) {
375     	for(unsigned int i=0; i < ids.size(); i++) {
376     		ids[i] = s_PreProcessAccessionsForDBv5(ids[i]);
377     	}
378     }
379     try {
380     m_BlastDb->AccessionsToOids(ids, oids);
381     }
382     catch (CSeqDBException & e) {
383     	if (e.GetMsg().find("DB contains no accession info") == NPOS){
384     		NCBI_RETHROW_SAME(e, e.GetMsg());
385     	}
386     }
387     for(unsigned i=0; i < ids.size(); i++) {
388     	if(oids[i] == kSeqDBEntryNotFound) {
389     		TGi num_id = NStr::StringToNumeric<TGi>(ids[i], NStr::fConvErr_NoThrow);
390     		if(!errno) {
391     			int gi_oid = -1;
392     			m_BlastDb->GiToOidwFilterCheck(num_id, gi_oid);
393     			if(gi_oid >= 0) {
394     				oids[i] = gi_oid;
395     			}
396     		}
397     		if(oids[i] == kSeqDBEntryNotFound) {
398     			err_found ++;
399     			ERR_POST (Error << "Skipped " << ids[i]);
400     			continue;
401     		}
402     	}
403     	if(x_ModifyConfigForBatchEntry(formats[i]))  {
404     		err_found ++;
405     		ERR_POST (Error << "Skipped " << ids[i]);
406     		continue;
407     	}
408     	if(m_TargetOnly) {
409     		fmt.Write(oids[i], m_Config, ids[i]);
410     	}
411     	else {
412     	   	fmt.Write(oids[i], m_Config);
413     	}
414     }
415     return (err_found) ? 1 : 0;
416 }
417 
418 int
x_ProcessBatchEntry(CBlastDB_Formatter & fmt)419 CBlastDBCmdApp::x_ProcessBatchEntry(CBlastDB_Formatter & fmt)
420 {
421 	int err_found = 0;
422    	const CArgs& args = GetArgs();
423     CNcbiIstream& input = args["entry_batch"].AsInputFile();
424 
425     while (input) {
426         string line;
427         NcbiGetlineEOL(input, line);
428         if ( !line.empty() ) {
429         	string id, format;
430         	NStr::SplitInTwo(line, " \t", id, format, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
431         	if(id.empty()) {
432         		continue;
433         	}
434         	if(x_ModifyConfigForBatchEntry(format))  {
435         		err_found ++;
436         		ERR_POST (Error << "Skipped " << id);
437         		continue;
438         	}
439    			vector<int> oids;
440          	if(!x_GetOids(id, oids)) {
441          		err_found ++;
442         		ERR_POST (Error << "Skipped " << id);
443         		continue;
444          	}
445 
446          	if (m_GetDuplicates) {
447          		for(unsigned int j=0; j < oids.size(); j++) {
448          			fmt.Write(oids[j], m_Config);
449          		}
450            	}
451          	else {
452          		if(m_TargetOnly) {
453          			fmt.Write(oids[0], m_Config, id);
454          		}
455          		else {
456          			fmt.Write(oids[0], m_Config);
457          		}
458          	}
459         }
460     }
461     return (err_found) ? 1 : 0;
462 }
463 
464 
465 int
x_ProcessBatchPig(CBlastDB_Formatter & fmt)466 CBlastDBCmdApp::x_ProcessBatchPig(CBlastDB_Formatter & fmt)
467 {
468 	int err_found = 0;
469    	const CArgs& args = GetArgs();
470     CNcbiIstream& input = args["ipg_batch"].AsInputFile();
471 
472     while (input) {
473         string line;
474         NcbiGetlineEOL(input, line);
475         if ( !line.empty() ) {
476         	string id, format;
477         	NStr::SplitInTwo(line, " \t", id, format, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
478         	if(id.empty()) {
479         		continue;
480         	}
481         	if(x_ModifyConfigForBatchEntry(format))  {
482         		err_found ++;
483         		ERR_POST (Error << "Skipped IPG : " << id);
484         		continue;
485         	}
486    			int oid;
487    			int pig = NStr::StringToInt(id, NStr::fConvErr_NoThrow );
488    			m_BlastDb->PigToOid(pig,oid);
489    			if (oid == -1) {
490          		err_found ++;
491         		ERR_POST (Error << "Skipped IPG: " << id);
492         		continue;
493          	}
494 
495    			fmt.Write(oid, m_Config);
496         }
497     }
498     return (err_found) ? 1 : 0;
499 }
500 
501 void
x_InitBlastDB()502 CBlastDBCmdApp::x_InitBlastDB()
503 {
504     const CArgs& args = GetArgs();
505 
506     CSeqDB::ESeqType seqtype = ParseMoleculeTypeString(args[kArgDbType].AsString());
507     m_BlastDb.Reset(new CSeqDBExpert(args[kArgDb].AsString(), seqtype));
508     m_DbIsProtein = static_cast<bool>(m_BlastDb->GetSequenceType() == CSeqDB::eProtein);
509 }
510 
511 void
x_PrintBlastDatabaseInformation()512 CBlastDBCmdApp::x_PrintBlastDatabaseInformation()
513 {
514     _ASSERT(m_BlastDb.NotEmpty());
515     static const NStr::TNumToStringFlags kFlags = NStr::fWithCommas;
516     const string kLetters = m_DbIsProtein ? "residues" : "bases";
517     const string kVersion = (m_BlastDb->GetBlastDbVersion() == EBlastDbVersion::eBDB_Version5) ? "5":"4";
518     const CArgs& args = GetArgs();
519 
520     CNcbiOstream& out = args[kArgOutput].AsOutputFile();
521 
522     // Print basic database information
523     out << "Database: " << m_BlastDb->GetTitle() << endl
524         << "\t" << NStr::IntToString(m_BlastDb->GetNumSeqs(), kFlags)
525         << " sequences; ";
526         if(args["exact_length"])
527         	out << NStr::UInt8ToString(m_BlastDb->GetExactTotalLength(), kFlags);
528         else
529         	out << NStr::UInt8ToString(m_BlastDb->GetTotalLength(), kFlags);
530     out << " total " << kLetters << endl << endl
531         << "Date: " << m_BlastDb->GetDate()
532         << "\tLongest sequence: "
533         << NStr::IntToString(m_BlastDb->GetMaxLength(), kFlags) << " "
534         << kLetters << endl << endl;
535 
536     out << "BLASTDB Version: " << kVersion << endl;
537 
538 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION  > 550)) && \
539      (!defined(NCBI_COMPILER_MIPSPRO)) )
540     // Print filtering algorithms supported
541     out << m_BlastDb->GetAvailableMaskAlgorithmDescriptions();
542 #endif
543 
544     // Print volume names
545     vector<string> volumes;
546     m_BlastDb->FindVolumePaths(volumes,false);
547     out << endl << "Volumes:" << endl;
548     ITERATE(vector<string>, file_name, volumes) {
549         out << "\t" << *file_name << endl;
550     }
551 }
552 
553 class CPrintTaxFields
554 {
555 private:
556 	enum {
557 		eTaxID,
558 		eSciName,
559 		eCommonName,
560 		eSuperKingdom,
561 		eBlastName,
562 		eMaxFields
563 	};
564 	CNcbiOstream & m_Out;
565 	vector<int> m_Fields;
566 	vector<string> m_Seperators;
567 	bool m_NeedTaxInfoLookup;
568 public:
CPrintTaxFields(CNcbiOstream & out,const string & fmt)569 	CPrintTaxFields(CNcbiOstream & out, const string & fmt): m_Out(out), m_NeedTaxInfoLookup(true) {
570 		vector<string> fields;
571         string sp = kEmptyStr;
572 		if(fmt == "%f") {
573 			m_Seperators.push_back(sp);
574 			for(unsigned int i=eTaxID; i < eMaxFields; i++){
575 				m_Fields.push_back(i);
576 				m_Seperators.push_back("\t");
577 			}
578 			return;
579 		}
580 
581 	    for (unsigned int i = 0; i < fmt.size(); i++) {
582 	    	if (fmt[i] == '%') {
583 		        if (fmt[i+1] == '%') {
584 		            sp += fmt[i];
585 		            continue;
586 		        }
587 		        i++;
588 		        switch (fmt[i]) {
589 		        case 'T' :
590        				m_Fields.push_back(eTaxID);
591        			break;
592 		        case 'S' :
593 		        	m_Fields.push_back(eSciName);
594                 break;
595 		        case 'L' :
596 		        	m_Fields.push_back(eCommonName);
597 		        break;
598 		        case 'K' :
599 		        	m_Fields.push_back(eSuperKingdom);
600                 break;
601 		        case 'B' :
602 		        	m_Fields.push_back(eBlastName);
603 		        break;
604 		        default:
605 	                sp += fmt[i-1];
606 	                sp += fmt[i];
607 	                continue;
608 	            break;
609 		        }
610 	            m_Seperators.push_back(sp);
611 	            sp = kEmptyStr;
612 	    	}
613 	    	else {
614 		        sp += fmt[i];
615 	    	}
616 	    }
617 		m_Seperators.push_back(sp);
618 
619 		if(m_Fields.empty()) {
620 			NCBI_THROW(CInputException, eInvalidInput,
621 				       "Invalid format options for tax_info.");
622 		}
623 		if((m_Fields.size() == 1) && (m_Fields[0] == eTaxID)){
624 			m_NeedTaxInfoLookup = false;
625 		}
626 	}
627 
PrintEntry(const SSeqDBTaxInfo & t)628 	void PrintEntry(const SSeqDBTaxInfo & t){
629 		for(unsigned int i=0; i < m_Fields.size(); i++) {
630 			m_Out << m_Seperators[i];
631 			switch (m_Fields[i]){
632 				case eTaxID:
633 					m_Out << t.taxid;
634 				break;
635 				case eSciName:
636 					m_Out << t.scientific_name;
637 				break;
638 				case eCommonName:
639 					m_Out << t.common_name;
640 				break;
641 				case eSuperKingdom:
642 					m_Out << t.s_kingdom;
643 				break;
644 				case eBlastName:
645 					m_Out << t.blast_name;
646 				break;
647 				default:
648 					NCBI_THROW(CInputException, eInvalidInput,
649 					           "Invalid format options for tax_info.");
650 				break;
651 			}
652 		}
653 		m_Out << m_Seperators.back();
654 		m_Out << "\n";
655 	}
NeedTaxNames()656 	bool NeedTaxNames(){return m_NeedTaxInfoLookup;}
657 };
658 
659 
660 void
x_PrintBlastDatabaseTaxInformation()661 CBlastDBCmdApp::x_PrintBlastDatabaseTaxInformation()
662 {
663     _ASSERT(m_BlastDb.NotEmpty());
664     const CArgs& args = GetArgs();
665 
666     CNcbiOstream& out = args[kArgOutput].AsOutputFile();
667     const string& kFmt = args["outfmt"].AsString();
668     CPrintTaxFields tf(out, kFmt);
669     set<TTaxId> tax_ids;
670     m_BlastDb->GetDBTaxIds(tax_ids);
671     // Print basic database information
672     out << "# of Tax IDs in Database: " << tax_ids.size() << endl;
673 	SSeqDBTaxInfo info;
674     ITERATE(set<TTaxId>, itr, tax_ids) {
675     	SSeqDBTaxInfo info;
676     	if(tf.NeedTaxNames()){
677     		CSeqDBTaxInfo::GetTaxNames(*itr, info);
678     		if(info.taxid == ZERO_TAX_ID){
679     			info.taxid = *itr;
680    				info.scientific_name = NA;
681    				info.common_name = NA;
682    				info.blast_name = NA;
683    				info.s_kingdom = NA;
684    			}
685     	}
686     	else {
687    			info.taxid = *itr;
688     	}
689    		tf.PrintEntry(info);
690     }
691 }
692 
693 
694 string
x_InitSearchRequest()695 CBlastDBCmdApp::x_InitSearchRequest()
696 {
697    	const CArgs& args = GetArgs();
698     m_GetDuplicates = args["get_dups"];
699     m_TargetOnly = args["target_only"];
700 
701     string outfmt = kEmptyStr;
702     if (args["outfmt"].HasValue()) {
703     	outfmt = args["outfmt"].AsString();
704         m_FASTA = false;
705         m_Asn1Bioseq = false;
706 
707         if ((outfmt.find("%f") != string::npos &&
708            	(outfmt.find("%b") != string::npos || outfmt.find("%d") != string::npos)) ||
709             (outfmt.find("%b") != string::npos && outfmt.find("%d") != string::npos)) {
710            	NCBI_THROW(CInputException, eInvalidInput,
711                     	"The %f, %b, %d output format options cannot be specified together.");
712         }
713 
714         if (outfmt.find("%b") != string::npos) {
715            	outfmt = "%b";
716            	m_Asn1Bioseq = true;
717         }
718 
719         // If "%f" is found within outfmt, discard everything else
720         if (outfmt.find("%f") != string::npos) {
721            	outfmt = "%f";
722            	m_FASTA = true;
723         }
724 
725         if (outfmt.find("%d") != string::npos) {
726            	outfmt = "%d";
727         }
728 
729         if (outfmt.find("%m") != string::npos) {
730            	int algo_id = 0;
731            	size_t i = outfmt.find("%m") + 2;
732            	bool found = false;
733            	while (i < outfmt.size() && outfmt[i] >= '0' && outfmt[i] <= '9') {
734            		algo_id = algo_id * 10 + (outfmt[i] - '0');
735                	outfmt.erase(i, 1);
736                	found = true;
737             }
738             if (!found) {
739                	NCBI_THROW(CInputException, eInvalidInput,
740                        	   "The option '-outfmt %m' is not followed by a masking algo ID.");
741             }
742             m_Config.m_FmtAlgoId = algo_id;
743     		if(!s_IsMaskAlgoIdValid(*m_BlastDb, m_Config.m_FmtAlgoId)) {
744     			NCBI_THROW(CInvalidDataException, eInvalidInput,
745     				                   "Invalid filtering algorithm ID for outfmt %m.");
746     		}
747         }
748     }
749 
750     if (args["strand"].HasValue() && !m_DbIsProtein) {
751     	if (args["strand"].AsString() == "plus") {
752                 m_Config.m_Strand = eNa_strand_plus;
753             } else if (args["strand"].AsString() == "minus") {
754                 m_Config.m_Strand = eNa_strand_minus;
755             } else {
756             	NCBI_THROW(CInputException, eInvalidInput,
757             	           "Both strands is not supported");
758             }
759     }
760     m_Config.m_UseCtrlA = args["ctrl_a"];
761     if (args["mask_sequence_with"].HasValue()) {
762     	m_Config.m_FiltAlgoId = -1;
763         m_Config.m_FiltAlgoId = NStr::StringToInt(args["mask_sequence_with"].AsString(), NStr::fConvErr_NoThrow);
764         if(errno) {
765         	m_Config.m_FiltAlgoId = m_BlastDb->GetMaskAlgorithmId(args["mask_sequence_with"].AsString());
766         }
767    		if(!s_IsMaskAlgoIdValid(*m_BlastDb, m_Config.m_FiltAlgoId)){
768    			NCBI_THROW(CInvalidDataException, eInvalidInput,
769    		               "Invalid filtering algorithm ID for mask_sequence_with.");
770    		}
771     }
772     if (args["range"].HasValue()) {
773     	m_Config.m_SeqRange = ParseSequenceRangeOpenEnd(args["range"].AsString());
774     }
775      return outfmt;
776 }
777 
778 int
x_ProcessSearchType(CBlastDB_Formatter & fmt)779 CBlastDBCmdApp::x_ProcessSearchType(CBlastDB_Formatter & fmt)
780 {
781    	const CArgs& args = GetArgs();
782 	if (args["entry"].HasValue() && args["entry"].AsString() == "all") {
783 		fmt.DumpAll(m_Config);
784 	}
785 	else if (args["entry_batch"].HasValue()) {
786 		if(m_GetDuplicates) {
787 			return x_ProcessBatchEntry(fmt);
788 		}
789 		else {
790 			return x_ProcessBatchEntry_NoDup(fmt);
791 		}
792 	}
793 	else if (args["entry"].HasValue() || args["ipg"].HasValue()) {
794 		return x_ProcessEntry(fmt);
795 	}
796 	else if (args["ipg_batch"].HasValue()) {
797 		return x_ProcessBatchPig(fmt);
798 	}
799 	else if(args[kArgTaxIdList].HasValue()||
800 			args[kArgTaxIdListFile].HasValue()) {
801 		 return x_ProcessTaxIdList(fmt);
802 	}
803 	else {
804 		NCBI_THROW(CInputException, eInvalidInput,
805 		       	   "Must specify query type: one of 'entry', 'entry_batch', or 'pig'");
806 	}
807 	return 0;
808 }
809 
x_UseLongSeqIds()810 bool CBlastDBCmdApp::x_UseLongSeqIds()
811 {
812 	const CArgs& args = GetArgs();
813 	if (args["long_seqids"].AsBoolean()) {
814 		return true;
815 	}
816 	CNcbiApplication* app = CNcbiApplication::Instance();
817 	if (app) {
818 		 const CNcbiRegistry& registry = app->GetConfig();
819 		 if (registry.Get("BLAST", "LONG_SEQID") == "1") {
820 			 return true;
821 		 }
822 	}
823 	return false;
824 }
825 
826 int
x_ProcessSearchRequest()827 CBlastDBCmdApp::x_ProcessSearchRequest()
828 {
829    	int err_found = 0;
830     try {
831     	const CArgs& args = GetArgs();
832     	CNcbiOstream& out = args[kArgOutput].AsOutputFile();
833     	string outfmt = x_InitSearchRequest();
834     	/* Special case: full db dump when no range and mask data is specified */
835     	if (m_FASTA) {
836     		CBlastDB_FastaFormatter fasta_fmt(*m_BlastDb, out, args["line_length"].AsInteger(), x_UseLongSeqIds());
837     		err_found = x_ProcessSearchType(fasta_fmt);
838     	}
839     	else if (m_Asn1Bioseq) {
840     		CBlastDB_BioseqFormatter bioseq_fmt(*m_BlastDb, out);
841     		err_found = x_ProcessSearchType(bioseq_fmt);
842     	}
843     	else {
844     		CBlastDB_SeqFormatter seq_fmt(outfmt, *m_BlastDb, out);
845     		err_found = x_ProcessSearchType(seq_fmt);
846     	}
847     }
848     catch (const CException& e) {
849     	ERR_POST(Error << e.GetMsg());
850         err_found = 1;
851     } catch (...) {
852         ERR_POST(Error << "Failed to retrieve requested item");
853         err_found = 1;
854     }
855 	return err_found;
856 }
857 
858 
Init()859 void CBlastDBCmdApp::Init()
860 {
861     HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
862 
863     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
864 
865     // Specify USAGE context
866     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
867                   "BLAST database client, version " + CBlastVersion().Print());
868 
869     arg_desc->SetCurrentGroup("BLAST database options");
870     arg_desc->AddDefaultKey(kArgDb, "dbname", "BLAST database name",
871                             CArgDescriptions::eString, "nr");
872 
873     arg_desc->AddDefaultKey(kArgDbType, "molecule_type",
874                             "Molecule type stored in BLAST database",
875                             CArgDescriptions::eString, "guess");
876     arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
877                                         "nucl", "prot", "guess"));
878 
879     arg_desc->SetCurrentGroup("Retrieval options");
880     arg_desc->AddOptionalKey("entry", "sequence_identifier",
881                      "Comma-delimited search string(s) of sequence identifiers"
882                      ":\n\te.g.: 555, AC147927, 'gnl|dbname|tag', or 'all' "
883                      "to select all\n\tsequences in the database",
884                      CArgDescriptions::eString);
885 
886     arg_desc->AddOptionalKey("entry_batch", "input_file",
887                  "Input file for batch processing (Format: one entry per line, seq id \n"
888 		 "followed by optional space-delimited specifier(s) [range|strand|mask_algo_id]",
889                  CArgDescriptions::eInputFile);
890     arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "entry");
891     arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "range");
892     arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "strand");
893     arg_desc->SetDependency("entry_batch", CArgDescriptions::eExcludes, "mask_sequence_with");
894 
895     arg_desc->AddOptionalKey("ipg", "IPG", "IPG to retrieve",
896                              CArgDescriptions::eInteger);
897     arg_desc->SetConstraint("ipg", new CArgAllowValuesGreaterThanOrEqual(0));
898     arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "entry");
899     arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "entry_batch");
900     arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "target_only");
901     arg_desc->SetDependency("ipg", CArgDescriptions::eExcludes, "ipg_batch");
902 
903     arg_desc->AddOptionalKey("ipg_batch", "input_file",
904                      "Input file for batch processing (Format: one entry per line, IPG \n"
905     		         "followed by optional space-delimited specifier(s) [range|strand|mask_algo_id]",
906                      CArgDescriptions::eInputFile);
907         arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "entry");
908         arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "entry_batch");
909         arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "range");
910         arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "strand");
911         arg_desc->SetDependency("ipg_batch", CArgDescriptions::eExcludes, "mask_sequence_with");
912 
913     arg_desc->AddOptionalKey(kArgTaxIdList, "taxonomy_ids",
914     						"Comma-delimited taxonomy identifiers", CArgDescriptions::eString);
915     arg_desc->SetDependency(kArgTaxIdList, CArgDescriptions::eExcludes, "entry");
916     arg_desc->SetDependency(kArgTaxIdList, CArgDescriptions::eExcludes, "entry_batch");
917     arg_desc->SetDependency(kArgTaxIdList, CArgDescriptions::eExcludes, "pig");
918 
919     arg_desc->AddOptionalKey(kArgTaxIdListFile, "input_file",
920     						"Input file for taxonomy identifiers", CArgDescriptions::eInputFile);
921     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, "entry");
922     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, "entry_batch");
923     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, "pig");
924     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, kArgTaxIdList);
925 
926     arg_desc->AddFlag("info", "Print BLAST database information", true);
927     // All other options to this program should be here
928     const char* exclusions[]  = { "entry", "entry_batch", "outfmt", "strand",
929         "target_only", "ctrl_a", "get_dups", "pig", "range",
930         "mask_sequence", "list", "remove_redundant_dbs", "recursive",
931         "list_outfmt", kArgTaxIdListFile.c_str(), kArgTaxIdList.c_str()};
932     for (size_t i = 0; i < sizeof(exclusions)/sizeof(*exclusions); i++) {
933         arg_desc->SetDependency("info", CArgDescriptions::eExcludes,
934                                 string(exclusions[i]));
935     }
936 
937     arg_desc->AddFlag("tax_info",
938     		          "Print taxonomic information contained in this BLAST database.\n"
939     		          "Use -outfmt to customize output. Format specifiers supported are:\n"
940     		          "\t\t%T means taxid\n"
941     		          "\t\t%L means common taxonomic name\n"
942     		          "\t\t%S means scientific name\n"
943     		          "\t\t%K means taxonomic super kingdom\n"
944     		          "\t\t%B means BLAST name\n"
945     		          "By default it prints: '%T %S %L %K %B'\n", true);
946     // All other options to this program should be here
947     const char* tax_info_exclusions[]  = { "info", "entry", "entry_batch", "strand",
948         "target_only", "ctrl_a", "get_dups", "pig", "range",
949         "mask_sequence", "list", "remove_redundant_dbs", "recursive",
950         "list_outfmt", kArgTaxIdListFile.c_str(), kArgTaxIdList.c_str() };
951     for (size_t i = 0; i < sizeof(tax_info_exclusions)/sizeof(*tax_info_exclusions); i++) {
952         arg_desc->SetDependency("tax_info", CArgDescriptions::eExcludes,
953                                 string(tax_info_exclusions[i]));
954     }
955 
956     arg_desc->SetCurrentGroup("Sequence retrieval configuration options");
957     arg_desc->AddOptionalKey("range", "numbers",
958                          "Range of sequence to extract in 1-based offsets "
959                          "(Format: start-stop, for start to end of sequence use start - )",
960                          CArgDescriptions::eString);
961 
962     arg_desc->AddDefaultKey("strand", "strand",
963                             "Strand of nucleotide sequence to extract",
964                             CArgDescriptions::eString, "plus");
965     arg_desc->SetConstraint("strand", &(*new CArgAllow_Strings, "minus",
966                                         "plus"));
967 
968     arg_desc->AddOptionalKey("mask_sequence_with", "mask_algo_id",
969                              "Produce lower-case masked FASTA using the "
970                              "algorithm ID specified",
971                              CArgDescriptions::eString);
972 
973     arg_desc->SetCurrentGroup("Output configuration options");
974     arg_desc->AddDefaultKey(kArgOutput, "output_file", "Output file name",
975                             CArgDescriptions::eOutputFile, "-");
976 
977     // The format specifiers below should be handled in
978     // CSeqFormatter::x_Builder
979     arg_desc->AddDefaultKey("outfmt", "format",
980             "Output format, where the available format specifiers are:\n"
981             "\t\t%f means sequence in FASTA format\n"
982             "\t\t%s means sequence data (without defline)\n"
983             "\t\t%a means accession\n"
984             "\t\t%g means gi\n"
985             "\t\t%o means ordinal id (OID)\n"
986             "\t\t%i means sequence id\n"
987             "\t\t%t means sequence title\n"
988             "\t\t%l means sequence length\n"
989             "\t\t%h means sequence hash value\n"
990             "\t\t%T means taxid\n"
991             "\t\t%X means leaf-node taxids\n"
992             "\t\t%e means membership integer\n"
993             "\t\t%L means common taxonomic name\n"
994             "\t\t%C means common taxonomic names for leaf-node taxids\n"
995             "\t\t%S means scientific name\n"
996             "\t\t%N means scientific names for leaf-node taxids\n"
997             "\t\t%B means BLAST name\n"     /* Is this useful outside NCBI? */
998 #if _DEBUG
999             "\t\t%n means a list of links integers separated by ';'\n"
1000 #endif /* _DEBUG */
1001             "\t\t%K means taxonomic super kingdom\n"
1002             "\t\t%P means PIG\n"
1003 #if _DEBUG
1004             "\t\t%d means defline in text ASN.1 format\n"
1005             "\t\t%b means Bioseq in text ASN.1 format\n"
1006 #endif /* _DEBUG */
1007             "\t\t%m means sequence masking data.\n"
1008             "\t\t   Masking data will be displayed as a series of 'N-M' values\n"
1009             "\t\t   separated by ';' or the word 'none' if none are available.\n"
1010 #if _DEBUG
1011             "\tIf '%f' or '%d' are specified, all other format specifiers are ignored.\n"
1012             "\tFor every format except '%f' and '%d', each line of output will "
1013 #else
1014             "\tIf '%f' is specified, all other format specifiers are ignored.\n"
1015             "\tFor every format except '%f', each line of output will "
1016 #endif /* _DEBUG */
1017             "correspond\n\tto a sequence.\n",
1018             CArgDescriptions::eString, "%f");
1019 
1020     //arg_desc->AddDefaultKey("target_only", "value",
1021     //                        "Definition line should contain target gi only",
1022     //                        CArgDescriptions::eBoolean, "false");
1023     arg_desc->AddFlag("target_only",
1024                       "Definition line should contain target entry only", true);
1025 
1026     //arg_desc->AddDefaultKey("get_dups", "value",
1027     //                        "Retrieve duplicate accessions",
1028     //                        CArgDescriptions::eBoolean, "false");
1029     arg_desc->AddFlag("get_dups", "Retrieve duplicate accessions", true);
1030     arg_desc->SetDependency("get_dups", CArgDescriptions::eExcludes,
1031                             "target_only");
1032 
1033     arg_desc->SetCurrentGroup("Output configuration options for FASTA format");
1034     arg_desc->AddDefaultKey("line_length", "number", "Line length for output",
1035                         CArgDescriptions::eInteger,
1036                         NStr::IntToString(80));
1037     arg_desc->SetConstraint("line_length",
1038                             new CArgAllowValuesGreaterThanOrEqual(1));
1039 
1040     arg_desc->AddFlag("ctrl_a",
1041                       "Use Ctrl-A as the non-redundant defline separator",true);
1042 
1043     const char* exclusions_discovery[]  = { "entry", "entry_batch", "outfmt",
1044         "strand", "target_only", "ctrl_a", "get_dups", "pig", "range", kArgDb.c_str(),
1045         "info", "mask_sequence", "line_length" };
1046     arg_desc->SetCurrentGroup("BLAST database configuration and discovery options");
1047     arg_desc->AddFlag("show_blastdb_search_path",
1048                       "Displays the default BLAST database search paths", true);
1049     arg_desc->AddOptionalKey("list", "directory",
1050                              "List BLAST databases in the specified directory",
1051                              CArgDescriptions::eString);
1052     arg_desc->AddFlag("remove_redundant_dbs",
1053                       "Remove the databases that are referenced by another "
1054                       "alias file in the directory in question", true);
1055     arg_desc->AddFlag("recursive",
1056                       "Recursively traverse the directory structure to list "
1057                       "available BLAST databases", true);
1058     arg_desc->AddDefaultKey("list_outfmt", "format",
1059             "Output format for the list option, where the available format specifiers are:\n"
1060             "\t\t%f means the BLAST database absolute file name path\n"
1061             "\t\t%p means the BLAST database molecule type\n"
1062             "\t\t%t means the BLAST database title\n"
1063             "\t\t%d means the date of last update of the BLAST database\n"
1064             "\t\t%l means the number of bases/residues in the BLAST database\n"
1065             "\t\t%n means the number of sequences in the BLAST database\n"
1066             "\t\t%U means the number of bytes used by the BLAST database\n"
1067             "\t\t%v means the BLAST database format version\n"
1068             "\tFor every format each line of output will "
1069             "correspond to a BLAST database.\n",
1070             CArgDescriptions::eString, "%f %p");
1071     for (size_t i = 0; i <
1072          sizeof(exclusions_discovery)/sizeof(*exclusions_discovery); i++) {
1073         arg_desc->SetDependency("list", CArgDescriptions::eExcludes,
1074                                 string(exclusions_discovery[i]));
1075         arg_desc->SetDependency("recursive", CArgDescriptions::eExcludes,
1076                                 string(exclusions_discovery[i]));
1077         arg_desc->SetDependency("remove_redundant_dbs", CArgDescriptions::eExcludes,
1078                                 string(exclusions_discovery[i]));
1079         arg_desc->SetDependency("list_outfmt", CArgDescriptions::eExcludes,
1080                                 string(exclusions_discovery[i]));
1081         arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1082                                 string(exclusions_discovery[i]));
1083     }
1084     arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1085                             "list");
1086     arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1087                             "recursive");
1088     arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1089                             "list_outfmt");
1090     arg_desc->SetDependency("show_blastdb_search_path", CArgDescriptions::eExcludes,
1091                             "remove_redundant_dbs");
1092 
1093     arg_desc->AddFlag("exact_length", "Get exact length for db info", true);
1094     arg_desc->SetDependency("exact_length", CArgDescriptions::eRequires,
1095                             "info");
1096     arg_desc->AddFlag("long_seqids", "Use long seq id for fasta deflines", true);
1097     arg_desc->SetDependency("long_seqids", CArgDescriptions::eExcludes, "info");
1098     SetupArgDescriptions(arg_desc.release());
1099 }
1100 
Run(void)1101 int CBlastDBCmdApp::Run(void)
1102 {
1103     int status = 0;
1104     const CArgs& args = GetArgs();
1105 
1106     // Silences warning in CSeq_id for CSeq_id::fParse_PartialOK
1107     SetDiagFilter(eDiagFilter_Post, "!(1306.10)");
1108     SetDiagPostLevel(eDiag_Warning);
1109     SetDiagPostPrefix("blastdbcmd");
1110 
1111     try {
1112         CNcbiOstream& out = args["out"].AsOutputFile();
1113         if (args["show_blastdb_search_path"]) {
1114             out << CSeqDB::GenerateSearchPath() << NcbiEndl;
1115             return status;
1116         } else if (args["list"]) {
1117             const string& blastdb_dir = args["list"].AsString();
1118             const bool recurse = args["recursive"];
1119             const bool remove_redundant_dbs = args["remove_redundant_dbs"];
1120             const string dbtype = args[kArgDbType]
1121                 ? args[kArgDbType].AsString()
1122                 : "guess";
1123             const string& kOutFmt = args["list_outfmt"].AsString();
1124             const vector<SSeqDBInitInfo> dbs =
1125                 FindBlastDBs(blastdb_dir, dbtype, recurse, true,
1126                              remove_redundant_dbs);
1127             CBlastDbFormatter blastdb_fmt(kOutFmt);
1128             ITERATE(vector<SSeqDBInitInfo>, db, dbs) {
1129                 out << blastdb_fmt.Write(*db) << NcbiEndl;
1130             }
1131             return status;
1132         }
1133 
1134         if (args["info"]) {
1135         	x_InitBlastDB();
1136             x_PrintBlastDatabaseInformation();
1137         }
1138         else if (args["tax_info"]) {
1139         	x_InitBlastDB();
1140             x_PrintBlastDatabaseTaxInformation();
1141         }
1142         else if(args[kArgTaxIdList].HasValue() ||
1143                 args[kArgTaxIdListFile].HasValue()) {
1144     		x_InitBlastDB_TaxIdList();
1145        		status = x_ProcessSearchRequest();
1146     	}
1147         else {
1148         	x_InitBlastDB();
1149        		status = x_ProcessSearchRequest();
1150         }
1151     	x_AddCmdOptions();
1152 
1153     } CATCH_ALL(status)
1154 
1155     m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, status);
1156     return status;
1157 }
1158 
x_AddCmdOptions()1159 void CBlastDBCmdApp::x_AddCmdOptions()
1160 {
1161 	const CArgs & args = GetArgs();
1162     if (args["info"]) {
1163     	 m_UsageReport.AddParam(CBlastUsageReport::eDBInfo, true);
1164     }
1165     else if (args["tax_info"]) {
1166     	 m_UsageReport.AddParam(CBlastUsageReport::eDBTaxInfo, true);
1167     }
1168     else if(args[kArgTaxIdList].HasValue() || args[kArgTaxIdListFile].HasValue()) {
1169     	 m_UsageReport.AddParam(CBlastUsageReport::eTaxIdList, true);
1170 	}
1171     else if(args["ipg"].HasValue() || args["ipg_batch"].HasValue()) {
1172     	 m_UsageReport.AddParam(CBlastUsageReport::eIPGList, true);
1173     }
1174     else if(args["entry"].HasValue() || args["entry_batch"].HasValue()) {
1175     	 m_UsageReport.AddParam(CBlastUsageReport::eDBEntry, true);
1176     	 if (args["entry"].HasValue() && args["entry"].AsString() == "all") {
1177     	 	m_UsageReport.AddParam(CBlastUsageReport::eDBDumpAll, true);
1178     	}
1179 		else {
1180     	 	m_UsageReport.AddParam(CBlastUsageReport::eDBEntry, true);
1181 		}
1182     }
1183     if(args["outfmt"].HasValue()) {
1184     	m_UsageReport.AddParam(CBlastUsageReport::eOutputFmt, args["outfmt"].AsString());
1185     }
1186 
1187 
1188 	string db_name = m_BlastDb->GetDBNameList();
1189 	int off = db_name.find_last_of(CFile::GetPathSeparator());
1190     if (off != -1) {
1191     	db_name.erase(0, off+1);
1192 	}
1193 	m_UsageReport.AddParam(CBlastUsageReport::eDBName, db_name);
1194 	m_UsageReport.AddParam(CBlastUsageReport::eDBLength, (Int8) m_BlastDb->GetTotalLength());
1195 	m_UsageReport.AddParam(CBlastUsageReport::eDBNumSeqs, m_BlastDb->GetNumSeqs());
1196 	m_UsageReport.AddParam(CBlastUsageReport::eDBDate, m_BlastDb->GetDate());
1197 }
1198 
1199 
1200 
1201 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])1202 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1203 {
1204     return CBlastDBCmdApp().AppMain(argc, argv);
1205 }
1206 #endif /* SKIP_DOXYGEN_PROCESSING */
1207