1 /* $Id: blastdb_aliastool.cpp 625561 2021-02-16 19:38:04Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Christiam Camacho
27 *
28 */
29
30 /** @file blastdb_aliastool.cpp
31 * Command line tool to create BLAST database aliases and associated files.
32 */
33
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbistre.hpp>
37 #include <algo/blast/api/version.hpp>
38 #include <objtools/blast/seqdb_reader/seqdbexpert.hpp>
39 #include <objtools/blast/seqdb_reader/seqdbcommon.hpp>
40 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
41 #include <objtools/blast/seqdb_writer/writedb.hpp>
42 #include <objtools/blast/seqdb_writer/writedb_error.hpp>
43 #include <objtools/blast/seqdb_writer/seqidlist_writer.hpp>
44 #include <objtools/blast/seqdb_reader/seqidlist_reader.hpp>
45
46 #include <algo/blast/blastinput/blast_input.hpp>
47 #include "../blast/blast_app_util.hpp"
48
49 #ifndef SKIP_DOXYGEN_PROCESSING
50 USING_NCBI_SCOPE;
51 USING_SCOPE(blast);
52 #endif
53
54 /// The main application class
55 class CBlastDBAliasApp : public CNcbiApplication
56 {
57 public:
58 /** @inheritDoc */
CBlastDBAliasApp()59 CBlastDBAliasApp() {
60 CRef<CVersion> version(new CVersion());
61 version->SetVersionInfo(new CBlastVersion());
62 SetFullVersion(version);
63 m_StopWatch.Start();
64 if (m_UsageReport.IsEnabled()) {
65 m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
66 m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "blastdb_aliastool");
67 }
68 }
~CBlastDBAliasApp()69 ~CBlastDBAliasApp() {
70 m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
71 }
72 private:
73 /** @inheritDoc */
74 virtual void Init();
75 /** @inheritDoc */
76 virtual int Run();
77
78 /// Converts gi files from binary to text format
79 /// @param input Input stream with text file [in]
80 /// @param output Output stream where converted binary gi list will be
81 /// written [out]
82 /// @return 0 on success
83 int ConvertGiFile(CNcbiIstream& input, CNcbiOstream& output,
84 const string* input_fname = NULL,
85 const string* output_fname = NULL) const;
86 /// Invokes function to create an alias file with the arguments provided on
87 /// the command line
88 void CreateAliasFile() const;
89
90 int x_ConvertSeqIDFile() const;
91 void x_SeqIDFileInfo() const;
92
93 void x_AddCmdOptions();
94 /// Documentation for this program
95 static const char * const DOCUMENTATION;
96
97 /// Describes the modes of operation of this application
98 enum EOperationMode {
99 eCreateAlias, ///< Create alias files
100 eConvertGiFile, ///< Convert gi files from text to binary format
101 eConvertSeqIDFile, ///< Convert text seqidlist files from proprietory binary format
102 eSeqIDFileInfo ///< Display info about seqidlist file
103 };
104
105 /// Determine what mode of operation is being used
x_GetOperationMode() const106 EOperationMode x_GetOperationMode() const {
107 EOperationMode retval = eCreateAlias;
108 if (GetArgs()["gi_file_in"].HasValue()) {
109 retval = eConvertGiFile;
110 }
111 if (GetArgs()["seqid_file_in"].HasValue()) {
112 retval = eConvertSeqIDFile;
113 }
114 if (GetArgs()["seqid_file_info"].HasValue()) {
115 retval = eSeqIDFileInfo;
116 }
117 return retval;
118 }
119 vector<string> x_GetDbsToAggregate(const string dbs, const string file) const;
120 void x_AddVDBsToAliasFile( string filename, bool append, string title = kEmptyStr) const;
121
122 CBlastUsageReport m_UsageReport;
123 CStopWatch m_StopWatch;
124 };
125
126 const char * const CBlastDBAliasApp::DOCUMENTATION = "\n\n"
127 "This application has three modes of operation:\n\n"
128 "1) GI file conversion:\n"
129 " Converts a text file containing GIs (one per line) to a more efficient\n"
130 " binary format. This can be provided as an argument to the -gilist option\n"
131 " of the BLAST search command line binaries or to the -gilist option of\n"
132 " this program to create an alias file for a BLAST database (see below).\n\n"
133 "2) Alias file creation (restricting with GI List or Sequence ID List):\n"
134 " Creates an alias for a BLAST database and a GI or ID list which\n"
135 " restricts this database. This is useful if one often searches a subset\n"
136 " of a database (e.g., based on organism or a curated list). The alias\n"
137 " file makes the search appear as if one were searching a regular BLAST\n"
138 " database rather than the subset of one.\n\n"
139 "3) Alias file creation (aggregating BLAST databases):\n"
140 " Creates an alias for multiple BLAST databases. All databases must be of\n"
141 " the same molecule type (no validation is done). The relevant options are\n"
142 " -dblist and -num_volumes.\n";
143
144 static const string kOutput("out");
145
Init()146 void CBlastDBAliasApp::Init()
147 {
148 HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
149
150 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
151
152 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
153 "Application to create BLAST database aliases, version "
154 + CBlastVersion().Print() + DOCUMENTATION);
155
156 string dflt("Default = input file name provided to -gi_file_in argument");
157 dflt += " with the .bgl extension";
158
159 set<string> exclusions = {
160 kArgDb, kArgDbType, kArgDbTitle, kArgGiList, kArgSeqIdList, kArgOutput,
161 "dblist", "num_volumes", "vdblist", "seqid_file_in", "seqid_file_out",
162 "seqid_db", "seqid_dbtype", "seqid_file_info"
163 };
164
165 arg_desc->SetCurrentGroup("GI file conversion options");
166
167 arg_desc->AddOptionalKey("gi_file_in", "input_file",
168 "Text file to convert, should contain one GI per line",
169 CArgDescriptions::eInputFile);
170 for (string exclusion : exclusions) {
171 arg_desc->SetDependency("gi_file_in", CArgDescriptions::eExcludes, exclusion);
172 }
173
174 arg_desc->AddOptionalKey("gi_file_out", "output_file",
175 "File name of converted GI file\n" + dflt,
176 CArgDescriptions::eOutputFile,
177 CArgDescriptions::fPreOpen | CArgDescriptions::fBinary);
178 arg_desc->SetDependency("gi_file_out", CArgDescriptions::eRequires,
179 "gi_file_in");
180 for (string exclusion : exclusions) {
181 arg_desc->SetDependency("gi_file_out", CArgDescriptions::eExcludes, exclusion);
182 }
183
184 arg_desc->SetCurrentGroup("Alias file creation options");
185
186 arg_desc->AddOptionalKey(kArgDb, "dbname", "BLAST database name",
187 CArgDescriptions::eString);
188 arg_desc->SetDependency(kArgDb, CArgDescriptions::eRequires, kOutput);
189
190 arg_desc->AddDefaultKey(kArgDbType, "molecule_type",
191 "Molecule type stored in BLAST database",
192 CArgDescriptions::eString, "prot");
193 arg_desc->SetConstraint(kArgDbType, &(*new CArgAllow_Strings,
194 "nucl", "prot"));
195
196 arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
197 "Title for BLAST database\n"
198 "Default = name of BLAST database provided to -db"
199 " argument with the -gifile argument appended to it",
200 CArgDescriptions::eString);
201 arg_desc->SetDependency(kArgDbTitle, CArgDescriptions::eRequires, kOutput);
202
203 arg_desc->AddOptionalKey(kArgGiList, "input_file",
204 "Text or binary gi file to restrict the BLAST "
205 "database provided in -db argument\n"
206 "If text format is provided, it will be converted "
207 "to binary",
208 CArgDescriptions::eInputFile);
209 arg_desc->SetDependency(kArgGiList, CArgDescriptions::eRequires, kOutput);
210
211 arg_desc->AddOptionalKey(kArgSeqIdList, "input_file",
212 "Text sequence id or accession file to restrict "
213 "the BLAST database provided in -db argument",
214 CArgDescriptions::eInputFile);
215
216 arg_desc->SetDependency(kArgSeqIdList, CArgDescriptions::eRequires, kOutput);
217 arg_desc->SetDependency(kArgSeqIdList, CArgDescriptions::eExcludes,
218 kArgGiList);
219
220 arg_desc->AddOptionalKey(kArgTaxIdListFile, "input_file",
221 "Text taxonomy id file to restrict "
222 "the BLAST database provided in -db argument",
223 CArgDescriptions::eInputFile);
224
225 arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eRequires, kOutput);
226 arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, kArgGiList);
227 arg_desc->SetDependency(kArgTaxIdListFile, CArgDescriptions::eExcludes, kArgSeqIdList);
228
229 #ifdef NCBI_TI
230 arg_desc->AddFlag("process_as_tis",
231 "Process all numeric ID lists as TIs instead of GIs", true);
232 #endif
233
234 arg_desc->AddOptionalKey(kOutput, "database_name",
235 "Name of BLAST database alias to be created",
236 CArgDescriptions::eString);
237
238 arg_desc->AddOptionalKey("dblist", "database_names",
239 "A space separated list of BLAST database names to"
240 " aggregate",
241 CArgDescriptions::eString);
242
243 arg_desc->AddOptionalKey("dblist_file", "file_name",
244 "A file containing a list of BLAST database names"
245 " to aggregate, one per line",
246 CArgDescriptions::eInputFile);
247
248 /* For VDBLIST */
249 arg_desc->AddOptionalKey("vdblist", "vdb_names",
250 "A space separated list of VDB names to aggregate",
251 CArgDescriptions::eString);
252
253 arg_desc->AddOptionalKey("vdblist_file", "file_name",
254 "A file containing a list of vdb names"
255 " to aggregate, one per line",
256 CArgDescriptions::eInputFile);
257 const char* key[] = { "dblist", "dblist_file", "vdblist", "vdblist_file" };
258 for (size_t i = 0; i < sizeof(key)/sizeof(*key); i++) {
259 arg_desc->SetDependency(key[i], CArgDescriptions::eExcludes, kArgDb);
260 arg_desc->SetDependency(key[i], CArgDescriptions::eExcludes, "num_volumes");
261 arg_desc->SetDependency(key[i], CArgDescriptions::eRequires, kOutput);
262 arg_desc->SetDependency(key[i], CArgDescriptions::eRequires, kArgDbType);
263 arg_desc->SetDependency(key[i], CArgDescriptions::eRequires, kArgDbTitle);
264 }
265 arg_desc->SetDependency("dblist", CArgDescriptions::eExcludes, "dblist_file");
266 arg_desc->SetDependency("vdblist", CArgDescriptions::eExcludes, "vdblist_file");
267
268 CNcbiOstrstream msg;
269 msg << "Number of volumes to aggregate, in which case the "
270 << "basename for the database is extracted from the "
271 << kOutput << " option";
272 arg_desc->AddOptionalKey("num_volumes", "positive_integer",
273 CNcbiOstrstreamToString(msg),
274 CArgDescriptions::eInteger);
275 arg_desc->SetDependency("num_volumes", CArgDescriptions::eExcludes, kArgDb);
276 arg_desc->SetDependency("num_volumes", CArgDescriptions::eExcludes, kArgGiList);
277 arg_desc->SetDependency("num_volumes", CArgDescriptions::eExcludes, kArgSeqIdList);
278 arg_desc->SetDependency("num_volumes", CArgDescriptions::eRequires, kOutput);
279 arg_desc->SetDependency("num_volumes", CArgDescriptions::eRequires, kArgDbType);
280 arg_desc->SetDependency("num_volumes", CArgDescriptions::eRequires, kArgDbTitle);
281 arg_desc->SetConstraint("num_volumes", new CArgAllowValuesGreaterThanOrEqual(1));
282
283 string dflt_seqid("Default = input file name provided to -seqid_file_in argument");
284 set<string> seqid_exclusions = {
285 kArgDb, kArgDbType, kArgDbTitle, kArgGiList, kArgSeqIdList, kArgOutput,
286 "dblist", "num_volumes", "vdblist"
287 };
288 // "gi_file_in" and "gi_file_out" already exclude "seqid_file_in" and
289 // "seqid_file_out".
290
291 arg_desc->SetCurrentGroup("Seqd ID file conversion options");
292
293 arg_desc->AddOptionalKey("seqid_file_in", "input_file",
294 "Text file to convert, should contain one seq id per line",
295 CArgDescriptions::eInputFile);
296 for (string exclusion : seqid_exclusions) {
297 arg_desc->SetDependency("seqid_file_in", CArgDescriptions::eExcludes, exclusion);
298 }
299
300 arg_desc->AddOptionalKey("seqid_title", "seqid_title", "Title for seqid list.\n " +
301 dflt_seqid, CArgDescriptions::eString);
302 arg_desc->SetDependency("seqid_title", CArgDescriptions::eRequires, "seqid_file_in");
303
304 arg_desc->AddOptionalKey("seqid_file_out", "output_file",
305 "File name of converted seq id file\n" + dflt_seqid + " with the .bsl extension",
306 CArgDescriptions::eString);
307
308 arg_desc->AddOptionalKey("seqid_db", "dbname", "BLAST database for seqidlist",
309 CArgDescriptions::eString);
310 arg_desc->SetDependency("seqid_db", CArgDescriptions::eRequires, "seqid_file_in");
311
312 arg_desc->AddOptionalKey("seqid_dbtype", "molecule_type", "Molecule type BLAST database",
313 CArgDescriptions::eString);
314 arg_desc->SetDependency("seqid_dbtype", CArgDescriptions::eRequires, "seqid_file_in");
315 arg_desc->SetDependency("seqid_dbtype", CArgDescriptions::eRequires, "seqid_db");
316 arg_desc->SetConstraint("seqid_dbtype", &(*new CArgAllow_Strings, "nucl", "prot"));
317
318 for (string exclusion : seqid_exclusions) {
319 arg_desc->SetDependency("seqid_file_out", CArgDescriptions::eExcludes, exclusion);
320 }
321
322 set<string> seqid_info_exclusions = {
323 kArgDb, kArgDbType, kArgDbTitle, kArgGiList, kArgSeqIdList, kArgOutput,
324 "dblist", "num_volumes", "vdblist", "seqid_file_in", "seqid_file_out"
325 };
326 // "gi_file_in" and "gi_file_out" already exclude "seqid_file_info".
327
328 arg_desc->AddOptionalKey("seqid_file_info", "seqid_file_info", "Display seqidlist file info", CArgDescriptions::eString);
329 for (string exclusion : seqid_info_exclusions) {
330 arg_desc->SetDependency("seqid_file_info", CArgDescriptions::eExcludes, exclusion);
331 }
332
333
334 SetupArgDescriptions(arg_desc.release());
335 }
336
337 int
ConvertGiFile(CNcbiIstream & input,CNcbiOstream & output,const string * input_fname,const string * output_fname) const338 CBlastDBAliasApp::ConvertGiFile(CNcbiIstream& input,
339 CNcbiOstream& output,
340 const string* input_fname /* = NULL */,
341 const string* output_fname /* = NULL */) const
342 {
343 const CArgs& args = GetArgs();
344 CBinaryListBuilder::EIdType type = CBinaryListBuilder::eGi;
345 string product("GI");
346 if (args.Exist("process_as_tis") && args["process_as_tis"]) {
347 type = CBinaryListBuilder::eTi;
348 product.assign("TI");
349 }
350 CBinaryListBuilder builder(type);
351
352 unsigned int line_ctr = 0;
353 while (input) {
354 string line;
355 NcbiGetlineEOL(input, line);
356 line_ctr++;
357 if ( !line.empty() ) {
358 if (NStr::StartsWith(line, "#")) continue;
359 try { builder.AppendId(NStr::StringToInt8(line)); }
360 catch (const CStringException& e) {
361 ERR_POST(Warning << "error in line " << line_ctr
362 << ": " << e.GetMsg());
363 }
364 }
365 }
366
367 builder.Write(output);
368 if (input_fname && output_fname) {
369 LOG_POST("Converted " << builder.Size() << " " << product << "s from "
370 << *input_fname << " to binary format in " << *output_fname);
371 } else {
372 LOG_POST("Converted " << builder.Size() << " " << product << "s into "
373 << "binary " << product << " file");
374 }
375 return 0;
376 }
377
378 void
CreateAliasFile() const379 CBlastDBAliasApp::CreateAliasFile() const
380 {
381 const CArgs& args = GetArgs();
382 string title;
383 bool isTiList = false;
384 if (args.Exist("process_as_tis") && args["process_as_tis"]) {
385 isTiList = true;
386 }
387
388 if (args[kArgDb].HasValue() && !args[kArgGiList].HasValue() &&
389 !args[kArgSeqIdList].HasValue()&& ! args[kArgTaxIdListFile].HasValue()) {
390
391 NCBI_THROW(CInputException, eInvalidInput, "Either gilist or "
392 "seqid_list must be specified if database name is used");
393 }
394
395 if (args[kArgDbTitle].HasValue()) {
396 title = args[kArgDbTitle].AsString();
397 } else if (args[kArgDb].HasValue()) {
398 _ASSERT(args[kArgGiList].HasValue() || args[kArgSeqIdList].HasValue() ||
399 args[kArgTaxIdListFile].HasValue());
400 title = args[kArgDb].AsString() + " limited by ";
401 if (args[kArgGiList]) {
402 title += args[kArgGiList].AsString();
403 }
404 else if (args[kArgSeqIdList]){
405 title += args[kArgSeqIdList].AsString();
406 } else {
407 title += args[kArgTaxIdListFile].AsString();
408 }
409 }
410 const CWriteDB::ESeqType seq_type =
411 args[kArgDbType].AsString() == "prot"
412 ? CWriteDB::eProtein
413 : CWriteDB::eNucleotide;
414
415 string gilist = args[kArgGiList] ? args[kArgGiList].AsString() : kEmptyStr;
416 if ( !gilist.empty() ) {
417 if ( !CFile(gilist).Exists() ) {
418 NCBI_THROW(CSeqDBException, eFileErr, gilist + " not found");
419 }
420 if ( (!isTiList && !SeqDB_IsBinaryGiList(gilist)) ||
421 (isTiList && !SeqDB_IsBinaryTiList(gilist)) ) {
422 const char mol_type = args[kArgDbType].AsString()[0];
423 _ASSERT(mol_type == 'p' || mol_type == 'n');
424 CNcbiOstrstream oss;
425 oss << args[kOutput].AsString() << "." << mol_type <<
426 (isTiList ? ".btl" : ".gil");
427 gilist.assign(CNcbiOstrstreamToString(oss));
428 const string& ifname = args[kArgGiList].AsString();
429 ifstream input(ifname.c_str());
430 ofstream output(gilist.c_str(), std::ios::binary);
431 ConvertGiFile(input, output, &ifname, &gilist);
432 }
433 }
434
435 if (args["dblist"].HasValue()) {
436 //use SeqDBExpert to check if the orginal db exists
437 CSeqDBExpert::ESeqType db_seqtype = seq_type == CWriteDB::eProtein ?
438 CSeqDBExpert::eProtein : CSeqDBExpert::eNucleotide;
439 CSeqDBExpert original_db(args["dblist"].AsString(), db_seqtype);
440 }
441
442 const EAliasFileFilterType alias_type = (isTiList ? eTiList : eGiList);
443 if (args["dblist"].HasValue() || args["dblist_file"].HasValue()) {
444 vector<string> dbs2aggregate = x_GetDbsToAggregate("dblist", "dblist_file");
445 CWriteDB_CreateAliasFile(args[kOutput].AsString(), dbs2aggregate,
446 seq_type, gilist, title, alias_type);
447 } else if (args["num_volumes"].HasValue()) {
448 const unsigned int num_vols =
449 static_cast<unsigned int>(args["num_volumes"].AsInteger());
450 CWriteDB_CreateAliasFile(args[kOutput].AsString(), num_vols, seq_type,
451 title);
452 } else if (args[kArgDb].HasValue() && args[kArgGiList]){
453 CWriteDB_CreateAliasFile(args[kOutput].AsString(),
454 args[kArgDb].AsString(),
455 seq_type, gilist,
456 title, alias_type);
457 } else if (args[kArgDb].HasValue() && args[kArgSeqIdList]){
458 string seqid_list = args[kArgSeqIdList].AsString();
459 if ( !seqid_list.empty() ) {
460 if ( !CFile(seqid_list).Exists() ) {
461 NCBI_THROW(CSeqDBException, eFileErr, seqid_list + " not found");
462 }
463 }
464 CWriteDB_CreateAliasFile(args[kOutput].AsString(),
465 args[kArgDb].AsString(),
466 seq_type, seqid_list,
467 title, eSeqIdList);
468 } else if (args[kArgDb].HasValue() && args[kArgTaxIdListFile]) {
469 string taxid_list = args[kArgTaxIdListFile].AsString();
470 if ( !taxid_list.empty() ) {
471 if ( !CFile(taxid_list).Exists() ) {
472 NCBI_THROW(CSeqDBException, eFileErr, taxid_list + " not found");
473 }
474 }
475 CWriteDB_CreateAliasFile(args[kOutput].AsString(),
476 args[kArgDb].AsString(),
477 seq_type, taxid_list,
478 title, eTaxIdList);
479 }
480
481 if (args["vdblist"].HasValue() || args["vdblist_file"].HasValue()) {
482 CNcbiOstrstream fname;
483 if (args["dblist"].HasValue() || args["dblist_file"].HasValue()) {
484 fname << args[kOutput].AsString() << (seq_type == CWriteDB::eProtein ? ".pal" : ".nal");
485 x_AddVDBsToAliasFile( CNcbiOstrstreamToString(fname), true );
486 }
487 else {
488 fname << args[kOutput].AsString() << (seq_type == CWriteDB::eProtein ? ".pvl" : ".nvl");
489 string title;
490 if(args["title"].HasValue()) {
491 title = args["title"].AsString();
492 }
493 x_AddVDBsToAliasFile( CNcbiOstrstreamToString(fname), false, title);
494 }
495 }
496 }
497
x_AddVDBsToAliasFile(string filename,bool append,string title) const498 void CBlastDBAliasApp::x_AddVDBsToAliasFile( string filename, bool append, string title) const
499 {
500 vector<string> vdbs = x_GetDbsToAggregate("vdblist", "vdblist_file");
501 if(vdbs.empty()) {
502 LOG_POST(Warning <<"Empty vdb list");
503 return;
504 }
505
506 IOS_BASE::openmode op_mode = IOS_BASE::out;
507 if(append) {
508 op_mode |= IOS_BASE::app;
509 }
510 CNcbiOfstream alias_file(filename.c_str(), op_mode);
511
512 if(!append) {
513 alias_file << "#\n# Alias file created " << CTime(CTime::eCurrent).AsString() << "\n#\n";
514 }
515
516 if(kEmptyStr != title) {
517 alias_file << "TITLE " << title << "\n";
518 }
519
520 alias_file << "VDBLIST ";
521 ITERATE(vector< string >, iter, vdbs) {
522 alias_file << "\"" << *iter << "\" ";
523 }
524 alias_file << "\n";
525 }
526
x_GetDbsToAggregate(const string dbs,const string file) const527 vector<string> CBlastDBAliasApp::x_GetDbsToAggregate(const string dbs, const string file) const
528 {
529 vector<string> retval;
530 const CArgs& args = GetArgs();
531 if (args[dbs].HasValue()) {
532 const string dblist = args[dbs].AsString();
533 NStr::Split(dblist, " ", retval);
534 } else if (args[file].HasValue()) {
535 CNcbiIstream& in(args[file].AsInputFile());
536 string line;
537 while (getline(in, line)) {
538 line = NStr::TruncateSpaces(line);
539 if (line.empty()) {
540 continue;
541 }
542 retval.push_back(line);
543 }
544 } else {
545 abort();
546 }
547 return retval;
548 }
549
550
551 int
x_ConvertSeqIDFile() const552 CBlastDBAliasApp::x_ConvertSeqIDFile() const
553 {
554 const CArgs& args = GetArgs();
555 CNcbiIstream& input = args["seqid_file_in"].AsInputFile();
556 string out_filename = kEmptyStr;
557 string title = kEmptyStr;
558 if(args["seqid_file_out"].HasValue()) {
559 out_filename = args["seqid_file_out"].AsString();
560 }
561 else {
562 out_filename = args["seqid_file_in"].AsString() + ".bsl";
563 }
564
565 if(args["seqid_title"].HasValue()) {
566 title = args["seqid_title"].AsString();
567 }
568 else {
569 CSeqDB_Path(args["seqid_file_in"].AsString()).FindFileName().GetString(title);
570 }
571
572 CNcbiOfstream output(out_filename.c_str(), IOS_BASE::binary | IOS_BASE::out);
573 unsigned int line_ctr = 0;
574 vector<string> seqid_list;
575 while (input) {
576 string line;
577 NcbiGetlineEOL(input, line);
578 line_ctr++;
579 if ( !line.empty() ) {
580 if (NStr::StartsWith(line, "#")) continue;
581 seqid_list.push_back(line);
582 }
583 }
584
585 if (args["seqid_db"].HasValue()) {
586 CSeqDB::ESeqType type = CSeqDB::eUnknown;
587 if (args["seqid_dbtype"].HasValue()) {
588 type = (args["seqid_dbtype"].AsString()[0] == 'p') ? CSeqDB::eProtein : CSeqDB::eNucleotide;
589 }
590 CSeqDB seqdb(args["seqid_db"].AsString(), type);
591 return WriteBlastSeqidlistFile(seqid_list, output, title, &seqdb);
592 }
593 else {
594 return WriteBlastSeqidlistFile(seqid_list, output, title);
595 }
596
597 }
598
599 void
x_SeqIDFileInfo() const600 CBlastDBAliasApp::x_SeqIDFileInfo() const
601 {
602 const CArgs& args = GetArgs();
603 CBlastSeqidlistFile::PrintSeqidlistInfo(args["seqid_file_info"].AsString(), std::cout);
604 }
605
606
Run(void)607 int CBlastDBAliasApp::Run(void)
608 {
609 const CArgs& args = GetArgs();
610 int status = 0;
611
612 try {
613
614 if (x_GetOperationMode() == eConvertGiFile) {
615 CNcbiIstream& input = args["gi_file_in"].AsInputFile();
616 string gi_file_out;
617 if (args["gi_file_out"].HasValue()) {
618 gi_file_out = args["gi_file_out"].AsString();
619 } else {
620 gi_file_out = args["gi_file_in"].AsString();
621 gi_file_out += ".bgl";
622 }
623 {
624 // output will close at end of scope.
625 CNcbiOfstream output(gi_file_out.c_str(),std::ios::binary);
626 status = ConvertGiFile(input, output);
627 }
628 if (!CFile(gi_file_out).Exists()) {
629 NCBI_THROW(CSeqDBException, eFileErr, gi_file_out + " not written");
630 }
631 } else if(x_GetOperationMode() == eConvertSeqIDFile) {
632 status = x_ConvertSeqIDFile();
633 } else if(x_GetOperationMode() == eSeqIDFileInfo) {
634 x_SeqIDFileInfo();
635 }
636 else {
637 CreateAliasFile();
638 }
639
640 } CATCH_ALL(status)
641 x_AddCmdOptions();
642 m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, status);
643 return status;
644 }
645
x_AddCmdOptions()646 void CBlastDBAliasApp::x_AddCmdOptions()
647 {
648 const CArgs & args = GetArgs();
649 if (args["gi_file_in"].HasValue()) {
650 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "gi_file_conversion");
651 }
652 else if (args["seqid_file_in"].HasValue()) {
653 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "seqid_file_conversion");
654 }
655 else if (args["seqid_file_info"].HasValue()) {
656 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "get_seqid_file_info");
657 }
658
659 if (args["dblist"].HasValue() || args["dblist_file"].HasValue() || args["num_volumes"].HasValue()) {
660 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_alias_db");
661 }
662 else if (args[kArgDb].HasValue() && args[kArgGiList]){
663 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_gilist_alias_db");
664 }
665 else if (args[kArgDb].HasValue() && args[kArgSeqIdList]){
666 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_seqidlist_alias_db");
667 }
668 else if (args[kArgDb].HasValue() && args[kArgTaxIdListFile]) {
669 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_taxidlist_alias_db");
670 }
671
672 if (args["vdblist"].HasValue() || args["vdblist_file"].HasValue()) {
673 if (args["dblist"].HasValue() || args["dblist_file"].HasValue()) {
674 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "add_vdblist");
675 }
676 else {
677 m_UsageReport.AddParam(CBlastUsageReport::eDBAliasMode, (string) "create_vdb_alias_db");
678 }
679 }
680 }
681
682
683
684 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])685 int main(int argc, const char* argv[] /*, const char* envp[]*/)
686 {
687 return CBlastDBAliasApp().AppMain(argc, argv);
688 }
689 #endif /* SKIP_DOXYGEN_PROCESSING */
690