1 /*  $Id: blastdb_aliastool.cpp 625561 2021-02-16 19:38:04Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Christiam Camacho
27  *
28  */
29 
30 /** @file blastdb_aliastool.cpp
31  * Command line tool to create BLAST database aliases and associated files.
32  */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <algo/blast/api/version.hpp>
38 #include <objtools/blast/seqdb_reader/seqdbexpert.hpp>
39 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
40 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
41 #include <objtools/blast/seqdb_writer/writedb.hpp>
42 #include <objtools/blast/seqdb_writer/writedb_error.hpp>
43 #include <objtools/blast/seqdb_writer/seqidlist_writer.hpp>
44 #include <objtools/blast/seqdb_reader/seqidlist_reader.hpp>
45 
46 #include <algo/blast/blastinput/blast_input.hpp>
47 #include "../blast/blast_app_util.hpp"
48 
49 #ifndef SKIP_DOXYGEN_PROCESSING
50 USING_NCBI_SCOPE;
51 USING_SCOPE(blast);
52 #endif
53 
54 /// The main application class
55 class CBlastDBAliasApp : public CNcbiApplication
56 {
57 public:
58     /** @inheritDoc */
CBlastDBAliasApp()59     CBlastDBAliasApp() {
60         CRef<CVersion> version(new CVersion());
61         version->SetVersionInfo(new CBlastVersion());
62         SetFullVersion(version);
63         m_StopWatch.Start();
64         if (m_UsageReport.IsEnabled()) {
65         	m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
66         	m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "blastdb_aliastool");
67         }
68     }
~CBlastDBAliasApp()69     ~CBlastDBAliasApp() {
70     	m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
71     }
72 private:
73     /** @inheritDoc */
74     virtual void Init();
75     /** @inheritDoc */
76     virtual int Run();
77 
78     /// Converts gi files from binary to text format
79     /// @param input Input stream with text file [in]
80     /// @param output Output stream where converted binary gi list will be
81     /// written [out]
82     /// @return 0 on success
83     int ConvertGiFile(CNcbiIstream& input, CNcbiOstream& output,
84                       const string* input_fname = NULL,
85                       const string* output_fname = NULL) const;
86     /// Invokes function to create an alias file with the arguments provided on
87     /// the command line
88     void CreateAliasFile() const;
89 
90     int x_ConvertSeqIDFile() const;
91     void x_SeqIDFileInfo() const;
92 
93     void x_AddCmdOptions();
94     /// Documentation for this program
95     static const char * const DOCUMENTATION;
96 
97     /// Describes the modes of operation of this application
98     enum EOperationMode {
99         eCreateAlias,       ///< Create alias files
100         eConvertGiFile,     ///< Convert gi files from text to binary format
101         eConvertSeqIDFile,  ///< Convert text seqidlist files from proprietory binary format
102         eSeqIDFileInfo      ///< Display info about seqidlist file
103     };
104 
105     /// Determine what mode of operation is being used
x_GetOperationMode() const106     EOperationMode x_GetOperationMode() const {
107         EOperationMode retval = eCreateAlias;
108         if (GetArgs()["gi_file_in"].HasValue()) {
109             retval = eConvertGiFile;
110         }
111         if (GetArgs()["seqid_file_in"].HasValue()) {
112             retval = eConvertSeqIDFile;
113         }
114         if (GetArgs()["seqid_file_info"].HasValue()) {
115             retval = eSeqIDFileInfo;
116         }
117         return retval;
118     }
119     vector<string> x_GetDbsToAggregate(const string dbs, const string file) const;
120     void x_AddVDBsToAliasFile( string filename, bool append, string title = kEmptyStr) const;
121 
122     CBlastUsageReport m_UsageReport;
123     CStopWatch m_StopWatch;
124 };
125 
126 const char * const CBlastDBAliasApp::DOCUMENTATION = "\n\n"
127 "This application has three modes of operation:\n\n"
128 "1) GI file conversion:\n"
129 "   Converts a text file containing GIs (one per line) to a more efficient\n"
130 "   binary format. This can be provided as an argument to the -gilist option\n"
131 "   of the BLAST search command line binaries or to the -gilist option of\n"
132 "   this program to create an alias file for a BLAST database (see below).\n\n"
133 "2) Alias file creation (restricting with GI List or Sequence ID List):\n"
134 "   Creates an alias for a BLAST database and a GI or ID list which\n"
135 "   restricts this database. This is useful if one often searches a subset\n"
136 "   of a database (e.g., based on organism or a curated list). The alias\n"
137 "   file makes the search appear as if one were searching a regular BLAST\n"
138 "   database rather than the subset of one.\n\n"
139 "3) Alias file creation (aggregating BLAST databases):\n"
140 "   Creates an alias for multiple BLAST databases. All databases must be of\n"
141 "   the same molecule type (no validation is done). The relevant options are\n"
142 "   -dblist and -num_volumes.\n";
143 
144 static const string kOutput("out");
145 
Init()146 void CBlastDBAliasApp::Init()
147 {
148     HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
149 
150     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
151 
152     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
153                               "Application to create BLAST database aliases, version "
154                               + CBlastVersion().Print() + DOCUMENTATION);
155 
156     string dflt("Default = input file name provided to -gi_file_in argument");
157     dflt += " with the .bgl extension";
158 
159     set<string> exclusions  = {
160         kArgDb, kArgDbType, kArgDbTitle, kArgGiList, kArgSeqIdList, kArgOutput,
161         "dblist", "num_volumes", "vdblist", "seqid_file_in", "seqid_file_out",
162         "seqid_db", "seqid_dbtype", "seqid_file_info"
163     };
164 
165     arg_desc->SetCurrentGroup("GI file conversion options");
166 
167     arg_desc->AddOptionalKey("gi_file_in", "input_file",
168                              "Text file to convert, should contain one GI per line",
169                              CArgDescriptions::eInputFile);
170     for (string exclusion : exclusions) {
171         arg_desc->SetDependency("gi_file_in", CArgDescriptions::eExcludes, exclusion);
172     }
173 
174     arg_desc->AddOptionalKey("gi_file_out", "output_file",
175                              "File name of converted GI file\n" + dflt,
176                              CArgDescriptions::eOutputFile,
177                              CArgDescriptions::fPreOpen | CArgDescriptions::fBinary);
178     arg_desc->SetDependency("gi_file_out", CArgDescriptions::eRequires,
179                             "gi_file_in");
180     for (string exclusion : exclusions) {
181         arg_desc->SetDependency("gi_file_out", CArgDescriptions::eExcludes, exclusion);
182     }
183 
184     arg_desc->SetCurrentGroup("Alias file creation options");
185 
186     arg_desc->AddOptionalKey(kArgDb, "dbname", "BLAST database name",
187                              CArgDescriptions::eString);
188     arg_desc->SetDependency(kArgDb, CArgDescriptions::eRequires, kOutput);
189 
190     arg_desc->AddDefaultKey(kArgDbType, "molecule_type",
191                             "Molecule type stored in BLAST database",
192                             CArgDescriptions::eString, "prot");
193     arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
194                                           "nucl", "prot"));
195 
196     arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
197                              "Title for BLAST database\n"
198                              "Default = name of BLAST database provided to -db"
199                              " argument with the -gifile argument appended to it",
200                              CArgDescriptions::eString);
201     arg_desc->SetDependency(kArgDbTitle, CArgDescriptions::eRequires, kOutput);
202 
203     arg_desc->AddOptionalKey(kArgGiList, "input_file",
204                              "Text or binary gi file to restrict the BLAST "
205                              "database provided in -db argument\n"
206                              "If text format is provided, it will be converted "
207                              "to binary",
208                              CArgDescriptions::eInputFile);
209     arg_desc->SetDependency(kArgGiList, CArgDescriptions::eRequires, kOutput);
210 
211     arg_desc->AddOptionalKey(kArgSeqIdList, "input_file",
212                              "Text sequence id or accession file to restrict "
213                              "the BLAST database provided in -db argument",
214                              CArgDescriptions::eInputFile);
215 
216     arg_desc->SetDependency(kArgSeqIdList, CArgDescriptions::eRequires, kOutput);
217     arg_desc->SetDependency(kArgSeqIdList, CArgDescriptions::eExcludes,
218                             kArgGiList);
219 
220     arg_desc->AddOptionalKey(kArgTaxIdListFile, "input_file",
221                                  "Text taxonomy id file to restrict "
222                                  "the BLAST database provided in -db argument",
223                                  CArgDescriptions::eInputFile);
224 
225     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eRequires, kOutput);
226     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, kArgGiList);
227     arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, kArgSeqIdList);
228 
229 #ifdef NCBI_TI
230     arg_desc->AddFlag("process_as_tis",
231                       "Process all numeric ID lists as TIs instead of GIs", true);
232 #endif
233 
234     arg_desc->AddOptionalKey(kOutput, "database_name",
235                              "Name of BLAST database alias to be created",
236                              CArgDescriptions::eString);
237 
238     arg_desc->AddOptionalKey("dblist", "database_names",
239                              "A space separated list of BLAST database names to"
240                              " aggregate",
241                              CArgDescriptions::eString);
242 
243     arg_desc->AddOptionalKey("dblist_file", "file_name",
244                              "A file containing a list of BLAST database names"
245                              " to aggregate, one per line",
246                              CArgDescriptions::eInputFile);
247 
248     /* For VDBLIST */
249     arg_desc->AddOptionalKey("vdblist", "vdb_names",
250                              "A space separated list of VDB names to aggregate",
251                              CArgDescriptions::eString);
252 
253     arg_desc->AddOptionalKey("vdblist_file", "file_name",
254                              "A file containing a list of vdb names"
255                              " to aggregate, one per line",
256                              CArgDescriptions::eInputFile);
257     const char* key[] = { "dblist", "dblist_file", "vdblist", "vdblist_file" };
258     for (size_t i = 0; i < sizeof(key)/sizeof(*key); i++) {
259         arg_desc->SetDependency(key[i], CArgDescriptions::eExcludes, kArgDb);
260         arg_desc->SetDependency(key[i], CArgDescriptions::eExcludes, "num_volumes");
261         arg_desc->SetDependency(key[i], CArgDescriptions::eRequires, kOutput);
262         arg_desc->SetDependency(key[i], CArgDescriptions::eRequires, kArgDbType);
263         arg_desc->SetDependency(key[i], CArgDescriptions::eRequires, kArgDbTitle);
264     }
265     arg_desc->SetDependency("dblist", CArgDescriptions::eExcludes, "dblist_file");
266     arg_desc->SetDependency("vdblist", CArgDescriptions::eExcludes, "vdblist_file");
267 
268     CNcbiOstrstream msg;
269     msg << "Number of volumes to aggregate, in which case the "
270         << "basename for the database is extracted from the "
271         << kOutput << " option";
272     arg_desc->AddOptionalKey("num_volumes", "positive_integer",
273                              CNcbiOstrstreamToString(msg),
274                              CArgDescriptions::eInteger);
275     arg_desc->SetDependency("num_volumes", CArgDescriptions::eExcludes, kArgDb);
276     arg_desc->SetDependency("num_volumes", CArgDescriptions::eExcludes, kArgGiList);
277     arg_desc->SetDependency("num_volumes", CArgDescriptions::eExcludes, kArgSeqIdList);
278     arg_desc->SetDependency("num_volumes", CArgDescriptions::eRequires, kOutput);
279     arg_desc->SetDependency("num_volumes", CArgDescriptions::eRequires, kArgDbType);
280     arg_desc->SetDependency("num_volumes", CArgDescriptions::eRequires, kArgDbTitle);
281     arg_desc->SetConstraint("num_volumes", new CArgAllowValuesGreaterThanOrEqual(1));
282 
283     string dflt_seqid("Default = input file name provided to -seqid_file_in argument");
284     set<string> seqid_exclusions  = {
285         kArgDb, kArgDbType, kArgDbTitle, kArgGiList, kArgSeqIdList, kArgOutput,
286         "dblist", "num_volumes", "vdblist"
287     };
288     // "gi_file_in" and "gi_file_out" already exclude "seqid_file_in" and
289     // "seqid_file_out".
290 
291     arg_desc->SetCurrentGroup("Seqd ID file conversion options");
292 
293     arg_desc->AddOptionalKey("seqid_file_in", "input_file",
294                              "Text file to convert, should contain one seq id per line",
295                              CArgDescriptions::eInputFile);
296     for (string exclusion : seqid_exclusions) {
297         arg_desc->SetDependency("seqid_file_in", CArgDescriptions::eExcludes, exclusion);
298     }
299 
300     arg_desc->AddOptionalKey("seqid_title", "seqid_title", "Title for seqid list.\n " +
301                              dflt_seqid, CArgDescriptions::eString);
302     arg_desc->SetDependency("seqid_title", CArgDescriptions::eRequires, "seqid_file_in");
303 
304     arg_desc->AddOptionalKey("seqid_file_out", "output_file",
305                              "File name of converted seq id file\n" + dflt_seqid + " with the .bsl extension",
306                              CArgDescriptions::eString);
307 
308     arg_desc->AddOptionalKey("seqid_db", "dbname", "BLAST database for seqidlist",
309                              CArgDescriptions::eString);
310     arg_desc->SetDependency("seqid_db", CArgDescriptions::eRequires, "seqid_file_in");
311 
312     arg_desc->AddOptionalKey("seqid_dbtype", "molecule_type", "Molecule type BLAST database",
313                              CArgDescriptions::eString);
314     arg_desc->SetDependency("seqid_dbtype", CArgDescriptions::eRequires, "seqid_file_in");
315     arg_desc->SetDependency("seqid_dbtype", CArgDescriptions::eRequires, "seqid_db");
316     arg_desc->SetConstraint("seqid_dbtype", &(*new CArgAllow_Strings, "nucl", "prot"));
317 
318     for (string exclusion : seqid_exclusions) {
319         arg_desc->SetDependency("seqid_file_out", CArgDescriptions::eExcludes, exclusion);
320     }
321 
322     set<string> seqid_info_exclusions = {
323         kArgDb, kArgDbType, kArgDbTitle, kArgGiList, kArgSeqIdList, kArgOutput,
324         "dblist", "num_volumes", "vdblist", "seqid_file_in", "seqid_file_out"
325     };
326     // "gi_file_in" and "gi_file_out" already exclude "seqid_file_info".
327 
328     arg_desc->AddOptionalKey("seqid_file_info", "seqid_file_info", "Display seqidlist file info", CArgDescriptions::eString);
329     for (string exclusion : seqid_info_exclusions) {
330         arg_desc->SetDependency("seqid_file_info", CArgDescriptions::eExcludes, exclusion);
331     }
332 
333 
334     SetupArgDescriptions(arg_desc.release());
335 }
336 
337 int
ConvertGiFile(CNcbiIstream & input,CNcbiOstream & output,const string * input_fname,const string * output_fname) const338 CBlastDBAliasApp::ConvertGiFile(CNcbiIstream& input,
339                                 CNcbiOstream& output,
340                                 const string* input_fname /* = NULL */,
341                                 const string* output_fname /* = NULL */) const
342 {
343     const CArgs& args = GetArgs();
344     CBinaryListBuilder::EIdType type = CBinaryListBuilder::eGi;
345     string product("GI");
346     if (args.Exist("process_as_tis") && args["process_as_tis"]) {
347         type = CBinaryListBuilder::eTi;
348         product.assign("TI");
349     }
350     CBinaryListBuilder builder(type);
351 
352     unsigned int line_ctr = 0;
353     while (input) {
354         string line;
355         NcbiGetlineEOL(input, line);
356         line_ctr++;
357         if ( !line.empty() ) {
358             if (NStr::StartsWith(line, "#")) continue;
359             try { builder.AppendId(NStr::StringToInt8(line)); }
360             catch (const CStringException& e) {
361                 ERR_POST(Warning << "error in line " << line_ctr
362                          << ": " << e.GetMsg());
363             }
364         }
365     }
366 
367     builder.Write(output);
368     if (input_fname && output_fname) {
369         LOG_POST("Converted " << builder.Size() << " " << product << "s from "
370                  << *input_fname << " to binary format in " << *output_fname);
371     } else {
372         LOG_POST("Converted " << builder.Size() << " " << product << "s into "
373                  << "binary " << product << " file");
374     }
375     return 0;
376 }
377 
378 void
CreateAliasFile() const379 CBlastDBAliasApp::CreateAliasFile() const
380 {
381     const CArgs& args = GetArgs();
382     string title;
383     bool isTiList = false;
384     if (args.Exist("process_as_tis") && args["process_as_tis"]) {
385         isTiList = true;
386     }
387 
388     if (args[kArgDb].HasValue() && !args[kArgGiList].HasValue() &&
389         !args[kArgSeqIdList].HasValue()&& ! args[kArgTaxIdListFile].HasValue()) {
390 
391         NCBI_THROW(CInputException, eInvalidInput, "Either gilist or "
392                    "seqid_list must be specified if database name is used");
393     }
394 
395     if (args[kArgDbTitle].HasValue()) {
396         title = args[kArgDbTitle].AsString();
397     } else if (args[kArgDb].HasValue()) {
398         _ASSERT(args[kArgGiList].HasValue() || args[kArgSeqIdList].HasValue() ||
399         		args[kArgTaxIdListFile].HasValue());
400         title = args[kArgDb].AsString() + " limited by ";
401         if (args[kArgGiList]) {
402             title += args[kArgGiList].AsString();
403         }
404         else if (args[kArgSeqIdList]){
405             title += args[kArgSeqIdList].AsString();
406         } else {
407             title += args[kArgTaxIdListFile].AsString();
408         }
409     }
410     const CWriteDB::ESeqType seq_type =
411         args[kArgDbType].AsString() == "prot"
412         ? CWriteDB::eProtein
413         : CWriteDB::eNucleotide;
414 
415     string gilist = args[kArgGiList] ? args[kArgGiList].AsString() : kEmptyStr;
416     if ( !gilist.empty() ) {
417         if ( !CFile(gilist).Exists() ) {
418             NCBI_THROW(CSeqDBException, eFileErr, gilist + " not found");
419         }
420         if ( (!isTiList && !SeqDB_IsBinaryGiList(gilist)) ||
421              (isTiList && !SeqDB_IsBinaryTiList(gilist)) ) {
422             const char mol_type = args[kArgDbType].AsString()[0];
423             _ASSERT(mol_type == 'p' || mol_type == 'n');
424             CNcbiOstrstream oss;
425             oss << args[kOutput].AsString() << "." << mol_type <<
426                 (isTiList ? ".btl" : ".gil");
427             gilist.assign(CNcbiOstrstreamToString(oss));
428             const string& ifname = args[kArgGiList].AsString();
429             ifstream input(ifname.c_str());
430             ofstream output(gilist.c_str(), std::ios::binary);
431             ConvertGiFile(input, output, &ifname, &gilist);
432         }
433     }
434 
435     if (args["dblist"].HasValue()) {
436         //use SeqDBExpert to check if the orginal db exists
437         CSeqDBExpert::ESeqType db_seqtype = seq_type == CWriteDB::eProtein ?
438             CSeqDBExpert::eProtein : CSeqDBExpert::eNucleotide;
439         CSeqDBExpert original_db(args["dblist"].AsString(), db_seqtype);
440     }
441 
442     const EAliasFileFilterType alias_type = (isTiList ? eTiList : eGiList);
443     if (args["dblist"].HasValue() || args["dblist_file"].HasValue()) {
444         vector<string> dbs2aggregate = x_GetDbsToAggregate("dblist", "dblist_file");
445         CWriteDB_CreateAliasFile(args[kOutput].AsString(), dbs2aggregate,
446                                  seq_type, gilist, title, alias_type);
447     } else if (args["num_volumes"].HasValue()) {
448         const unsigned int num_vols =
449             static_cast<unsigned int>(args["num_volumes"].AsInteger());
450         CWriteDB_CreateAliasFile(args[kOutput].AsString(), num_vols, seq_type,
451                                  title);
452     } else if (args[kArgDb].HasValue() && args[kArgGiList]){
453         CWriteDB_CreateAliasFile(args[kOutput].AsString(),
454                                  args[kArgDb].AsString(),
455                                  seq_type, gilist,
456                                  title, alias_type);
457     } else if (args[kArgDb].HasValue() && args[kArgSeqIdList]){
458     	string seqid_list = args[kArgSeqIdList].AsString();
459     	if ( !seqid_list.empty() ) {
460     	    if ( !CFile(seqid_list).Exists() ) {
461     	        NCBI_THROW(CSeqDBException, eFileErr, seqid_list + " not found");
462     	    }
463     	}
464         CWriteDB_CreateAliasFile(args[kOutput].AsString(),
465                                  args[kArgDb].AsString(),
466                                  seq_type, seqid_list,
467                                  title, eSeqIdList);
468     } else if (args[kArgDb].HasValue() && args[kArgTaxIdListFile]) {
469     	string taxid_list = args[kArgTaxIdListFile].AsString();
470     	if ( !taxid_list.empty() ) {
471     	    if ( !CFile(taxid_list).Exists() ) {
472     	        NCBI_THROW(CSeqDBException, eFileErr, taxid_list + " not found");
473     	    }
474     	}
475         CWriteDB_CreateAliasFile(args[kOutput].AsString(),
476                                  args[kArgDb].AsString(),
477                                  seq_type, taxid_list,
478                                  title, eTaxIdList);
479     }
480 
481     if (args["vdblist"].HasValue() || args["vdblist_file"].HasValue()) {
482     	CNcbiOstrstream fname;
483     	if (args["dblist"].HasValue() || args["dblist_file"].HasValue()) {
484     		fname << args[kOutput].AsString() << (seq_type == CWriteDB::eProtein ? ".pal" : ".nal");
485     		x_AddVDBsToAliasFile( CNcbiOstrstreamToString(fname), true );
486     	}
487     	else {
488     		fname << args[kOutput].AsString() << (seq_type == CWriteDB::eProtein ? ".pvl" : ".nvl");
489     		string title;
490     		if(args["title"].HasValue()) {
491     			title = args["title"].AsString();
492     		}
493     		x_AddVDBsToAliasFile( CNcbiOstrstreamToString(fname), false, title);
494     	}
495     }
496 }
497 
x_AddVDBsToAliasFile(string filename,bool append,string title) const498 void CBlastDBAliasApp::x_AddVDBsToAliasFile( string filename, bool append, string title) const
499 {
500     vector<string> vdbs = x_GetDbsToAggregate("vdblist", "vdblist_file");
501     if(vdbs.empty()) {
502     	 LOG_POST(Warning <<"Empty vdb list");
503     	 return;
504     }
505 
506 	IOS_BASE::openmode op_mode = IOS_BASE::out;
507 	if(append) {
508 		op_mode |= IOS_BASE::app;
509 	}
510 	CNcbiOfstream alias_file(filename.c_str(), op_mode);
511 
512 	if(!append) {
513 		alias_file <<  "#\n# Alias file created " << CTime(CTime::eCurrent).AsString() << "\n#\n";
514 	}
515 
516 	if(kEmptyStr != title) {
517 		alias_file << "TITLE " << title << "\n";
518 	}
519 
520 	 alias_file << "VDBLIST ";
521 	 ITERATE(vector< string >, iter, vdbs) {
522 		 alias_file << "\"" << *iter << "\" ";
523 	 }
524 	 alias_file << "\n";
525 }
526 
x_GetDbsToAggregate(const string dbs,const string file) const527 vector<string> CBlastDBAliasApp::x_GetDbsToAggregate(const string dbs, const string file) const
528 {
529     vector<string> retval;
530     const CArgs& args = GetArgs();
531     if (args[dbs].HasValue()) {
532         const string dblist = args[dbs].AsString();
533         NStr::Split(dblist, " ", retval);
534     } else if (args[file].HasValue()) {
535         CNcbiIstream& in(args[file].AsInputFile());
536         string line;
537         while (getline(in, line)) {
538             line = NStr::TruncateSpaces(line);
539             if (line.empty()) {
540                 continue;
541             }
542             retval.push_back(line);
543         }
544     } else {
545         abort();
546     }
547     return retval;
548 }
549 
550 
551 int
x_ConvertSeqIDFile() const552 CBlastDBAliasApp::x_ConvertSeqIDFile() const
553 {
554 	const CArgs& args = GetArgs();
555 	CNcbiIstream& input = args["seqid_file_in"].AsInputFile();
556 	string out_filename = kEmptyStr;
557 	string title = kEmptyStr;
558    	if(args["seqid_file_out"].HasValue()) {
559    		out_filename = args["seqid_file_out"].AsString();
560    	}
561    	else {
562 		out_filename = args["seqid_file_in"].AsString() + ".bsl";
563    	}
564 
565    	if(args["seqid_title"].HasValue()) {
566    		title = args["seqid_title"].AsString();
567    	}
568    	else {
569    		CSeqDB_Path(args["seqid_file_in"].AsString()).FindFileName().GetString(title);
570    	}
571 
572 	CNcbiOfstream output(out_filename.c_str(), IOS_BASE::binary | IOS_BASE::out);
573     unsigned int line_ctr = 0;
574     vector<string> seqid_list;
575     while (input) {
576         string line;
577         NcbiGetlineEOL(input, line);
578         line_ctr++;
579         if ( !line.empty() ) {
580             if (NStr::StartsWith(line, "#")) continue;
581             seqid_list.push_back(line);
582         }
583     }
584 
585     if (args["seqid_db"].HasValue()) {
586     	CSeqDB::ESeqType type = CSeqDB::eUnknown;
587     	if (args["seqid_dbtype"].HasValue()) {
588     		type = (args["seqid_dbtype"].AsString()[0] == 'p') ? CSeqDB::eProtein : CSeqDB::eNucleotide;
589     	}
590     	CSeqDB seqdb(args["seqid_db"].AsString(), type);
591     	return WriteBlastSeqidlistFile(seqid_list, output, title, &seqdb);
592     }
593     else {
594     	return WriteBlastSeqidlistFile(seqid_list, output, title);
595     }
596 
597 }
598 
599 void
x_SeqIDFileInfo() const600 CBlastDBAliasApp::x_SeqIDFileInfo() const
601 {
602 	const CArgs& args = GetArgs();
603 	CBlastSeqidlistFile::PrintSeqidlistInfo(args["seqid_file_info"].AsString(), std::cout);
604 }
605 
606 
Run(void)607 int CBlastDBAliasApp::Run(void)
608 {
609     const CArgs& args = GetArgs();
610     int status = 0;
611 
612     try {
613 
614         if (x_GetOperationMode() == eConvertGiFile) {
615             CNcbiIstream& input = args["gi_file_in"].AsInputFile();
616             string gi_file_out;
617             if (args["gi_file_out"].HasValue()) {
618                 gi_file_out = args["gi_file_out"].AsString();
619             } else {
620                 gi_file_out = args["gi_file_in"].AsString();
621                 gi_file_out += ".bgl";
622             }
623             {
624                 // output will close at end of scope.
625                 CNcbiOfstream output(gi_file_out.c_str(),std::ios::binary);
626                 status = ConvertGiFile(input, output);
627             }
628             if (!CFile(gi_file_out).Exists()) {
629                 NCBI_THROW(CSeqDBException, eFileErr, gi_file_out + " not written");
630             }
631         } else if(x_GetOperationMode() == eConvertSeqIDFile) {
632         	status = x_ConvertSeqIDFile();
633         } else if(x_GetOperationMode() == eSeqIDFileInfo) {
634         	x_SeqIDFileInfo();
635         }
636         else {
637             CreateAliasFile();
638         }
639 
640     } CATCH_ALL(status)
641     x_AddCmdOptions();
642     m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, status);
643     return status;
644 }
645 
x_AddCmdOptions()646 void CBlastDBAliasApp::x_AddCmdOptions()
647 {
648 	const CArgs & args = GetArgs();
649 	 if (args["gi_file_in"].HasValue()) {
650     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "gi_file_conversion");
651 	 }
652 	 else if (args["seqid_file_in"].HasValue()) {
653     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "seqid_file_conversion");
654 	 }
655 	 else if (args["seqid_file_info"].HasValue()) {
656     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "get_seqid_file_info");
657 	 }
658 
659 	 if (args["dblist"].HasValue() || args["dblist_file"].HasValue() || args["num_volumes"].HasValue()) {
660     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_alias_db");
661 	 }
662 	 else if (args[kArgDb].HasValue() && args[kArgGiList]){
663     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_gilist_alias_db");
664 	 }
665 	 else if (args[kArgDb].HasValue() && args[kArgSeqIdList]){
666     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_seqidlist_alias_db");
667 	 }
668 	 else if (args[kArgDb].HasValue() && args[kArgTaxIdListFile]) {
669     	 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_taxidlist_alias_db");
670 	 }
671 
672 	 if (args["vdblist"].HasValue() || args["vdblist_file"].HasValue()) {
673 	   	if (args["dblist"].HasValue() || args["dblist_file"].HasValue()) {
674 	   		m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "add_vdblist");
675 	   	}
676 	   	else {
677 	   		m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_vdb_alias_db");
678 	   	}
679 	 }
680 }
681 
682 
683 
684 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])685 int main(int argc, const char* argv[] /*, const char* envp[]*/)
686 {
687     return CBlastDBAliasApp().AppMain(argc, argv);
688 }
689 #endif /* SKIP_DOXYGEN_PROCESSING */
690