1 /* $Id: blast_args.cpp 637471 2021-09-13 18:26:17Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's offical duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================*/
25
26 /*****************************************************************************
27
28 File name: blast_args.cpp
29
30 Author: Jason Papadopoulos
31
32 ******************************************************************************/
33
34 /** @file blast_args.cpp
35 * convert blast-related command line
36 * arguments into blast options
37 */
38 #include <ncbi_pch.hpp>
39 #include <corelib/ncbi_system.hpp>
40 #include <algo/blast/api/version.hpp>
41 #include <algo/blast/blastinput/blast_args.hpp>
42 #include <algo/blast/api/blast_exception.hpp>
43 #include <algo/blast/api/blast_aux.hpp>
44 #include <algo/blast/api/objmgr_query_data.hpp> /* for CObjMgrQueryFactory */
45 #include <algo/blast/core/blast_nalookup.h>
46 #include <algo/blast/core/hspfilter_besthit.h>
47 #include <objects/scoremat/PssmWithParameters.hpp>
48 #include <util/format_guess.hpp>
49 #include <util/line_reader.hpp>
50 #include <objtools/blast/seqdb_reader/seqdb.hpp>
51 #include <algo/blast/blastinput/blast_input.hpp> // for CInputException
52 #include <algo/winmask/seq_masker_istat_factory.hpp> // for CSeqMaskerIstatFactory::DiscoverStatType
53 #include <connect/ncbi_connutil.h>
54 #include <objtools/align_format/align_format_util.hpp>
55
56 #include <algo/blast/api/msa_pssm_input.hpp> // for CPsiBlastInputClustalW
57 #include <algo/blast/api/pssm_engine.hpp> // for CPssmEngine
58
59 BEGIN_NCBI_SCOPE
60 BEGIN_SCOPE(blast)
61 USING_SCOPE(objects);
62 USING_SCOPE(align_format);
63
64 void
ExtractAlgorithmOptions(const CArgs &,CBlastOptions &)65 IBlastCmdLineArgs::ExtractAlgorithmOptions(const CArgs& /* cmd_line_args */,
66 CBlastOptions& /* options */)
67 {}
68
CProgramDescriptionArgs(const string & program_name,const string & program_desc)69 CProgramDescriptionArgs::CProgramDescriptionArgs(const string& program_name,
70 const string& program_desc)
71 : m_ProgName(program_name), m_ProgDesc(program_desc)
72 {}
73
74 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)75 CProgramDescriptionArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
76 {
77 // program description
78 arg_desc.SetUsageContext(m_ProgName, m_ProgDesc + " " +
79 CBlastVersion().Print());
80 }
81
CTaskCmdLineArgs(const set<string> & supported_tasks,const string & default_task)82 CTaskCmdLineArgs::CTaskCmdLineArgs(const set<string>& supported_tasks,
83 const string& default_task)
84 : m_SupportedTasks(supported_tasks), m_DefaultTask(default_task)
85 {
86 _ASSERT( !m_SupportedTasks.empty() );
87 if ( !m_DefaultTask.empty() ) {
88 _ASSERT(m_SupportedTasks.find(m_DefaultTask) != m_SupportedTasks.end());
89 }
90 }
91
92 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)93 CTaskCmdLineArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
94 {
95 arg_desc.SetCurrentGroup("General search options");
96 if ( !m_DefaultTask.empty() ) {
97 arg_desc.AddDefaultKey(kTask, "task_name", "Task to execute",
98 CArgDescriptions::eString, m_DefaultTask);
99 } else {
100 arg_desc.AddKey(kTask, "task_name", "Task to execute",
101 CArgDescriptions::eString);
102 }
103 arg_desc.SetConstraint(kTask, new CArgAllowStringSet(m_SupportedTasks));
104 arg_desc.SetCurrentGroup("");
105
106 }
107
108 void
ExtractAlgorithmOptions(const CArgs &,CBlastOptions &)109 CTaskCmdLineArgs::ExtractAlgorithmOptions(const CArgs& /* cmd_line_args */,
110 CBlastOptions& /* options */)
111 {
112 // N.B.: handling of tasks occurs at the application level to ensure that
113 // only relevant tasks are added (@sa CBlastnAppArgs)
114 }
115
116 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)117 CGenericSearchArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
118 {
119 arg_desc.SetCurrentGroup("General search options");
120
121 // evalue cutoff
122 if (!m_IsIgBlast) {
123 arg_desc.AddDefaultKey(kArgEvalue, "evalue",
124 "Expectation value (E) threshold for saving hits ",
125 CArgDescriptions::eDouble,
126 NStr::DoubleToString(BLAST_EXPECT_VALUE));
127 } else if (m_QueryIsProtein) {
128 arg_desc.AddDefaultKey(kArgEvalue, "evalue",
129 "Expectation value (E) threshold for saving hits ",
130 CArgDescriptions::eDouble,
131 NStr::DoubleToString(1.0));
132 } else {
133 //igblastn
134 arg_desc.AddDefaultKey(kArgEvalue, "evalue",
135 "Expectation value (E) threshold for saving hits ",
136 CArgDescriptions::eDouble,
137 NStr::DoubleToString(20.0));
138 }
139
140 // word size
141 // Default values: blastn=11, megablast=28, others=3
142 if(!m_IsRpsBlast) {
143 const string description = m_QueryIsProtein
144 ? "Word size for wordfinder algorithm"
145 : "Word size for wordfinder algorithm (length of best perfect match)";
146 arg_desc.AddOptionalKey(kArgWordSize, "int_value", description,
147 CArgDescriptions::eInteger);
148 arg_desc.SetConstraint(kArgWordSize, m_QueryIsProtein
149 ? new CArgAllowValuesGreaterThanOrEqual(2)
150 : new CArgAllowValuesGreaterThanOrEqual(4));
151 }
152
153 if ( !m_IsRpsBlast && !m_IsTblastx) {
154 // gap open penalty
155 arg_desc.AddOptionalKey(kArgGapOpen, "open_penalty",
156 "Cost to open a gap",
157 CArgDescriptions::eInteger);
158
159 // gap extend penalty
160 arg_desc.AddOptionalKey(kArgGapExtend, "extend_penalty",
161 "Cost to extend a gap",
162 CArgDescriptions::eInteger);
163 }
164
165
166 if (m_ShowPercentIdentity && !m_IsIgBlast) {
167 arg_desc.SetCurrentGroup("Restrict search or results");
168 arg_desc.AddOptionalKey(kArgPercentIdentity, "float_value",
169 "Percent identity",
170 CArgDescriptions::eDouble);
171 arg_desc.SetConstraint(kArgPercentIdentity,
172 new CArgAllow_Doubles(0.0, 100.0));
173 }
174
175 if (!m_IsIgBlast) {
176 arg_desc.SetCurrentGroup("Restrict search or results");
177 arg_desc.AddOptionalKey(kArgQueryCovHspPerc, "float_value",
178 "Percent query coverage per hsp",
179 CArgDescriptions::eDouble);
180 arg_desc.SetConstraint(kArgQueryCovHspPerc,
181 new CArgAllow_Doubles(0.0, 100.0));
182
183 arg_desc.AddOptionalKey(kArgMaxHSPsPerSubject, "int_value",
184 "Set maximum number of HSPs per subject sequence to save for each query",
185 CArgDescriptions::eInteger);
186 arg_desc.SetConstraint(kArgMaxHSPsPerSubject,
187 new CArgAllowValuesGreaterThanOrEqual(1));
188
189 arg_desc.SetCurrentGroup("Extension options");
190 // ungapped X-drop
191 // Default values: blastn=20, megablast=10, others=7
192 arg_desc.AddOptionalKey(kArgUngappedXDropoff, "float_value",
193 "X-dropoff value (in bits) for ungapped extensions",
194 CArgDescriptions::eDouble);
195
196 // Tblastx is ungapped only.
197 if (!m_IsTblastx) {
198 // initial gapped X-drop
199 // Default values: blastn=30, megablast=20, tblastx=0, others=15
200 arg_desc.AddOptionalKey(kArgGappedXDropoff, "float_value",
201 "X-dropoff value (in bits) for preliminary gapped extensions",
202 CArgDescriptions::eDouble);
203
204 // final gapped X-drop
205 // Default values: blastn/megablast=50, tblastx=0, others=25
206 arg_desc.AddOptionalKey(kArgFinalGappedXDropoff, "float_value",
207 "X-dropoff value (in bits) for final gapped alignment",
208 CArgDescriptions::eDouble);
209 }
210 }
211 arg_desc.SetCurrentGroup("Statistical options");
212 // effective search space
213 // Default value is the real size
214 arg_desc.AddOptionalKey(kArgEffSearchSpace, "int_value",
215 "Effective length of the search space",
216 CArgDescriptions::eInt8);
217 arg_desc.SetConstraint(kArgEffSearchSpace,
218 new CArgAllowValuesGreaterThanOrEqual(0));
219
220 if (!m_SuppressSumStats) {
221 arg_desc.AddOptionalKey(kArgSumStats, "bool_value",
222 "Use sum statistics",
223 CArgDescriptions::eBoolean);
224 }
225
226 arg_desc.SetCurrentGroup("");
227 }
228
229 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)230 CGenericSearchArgs::ExtractAlgorithmOptions(const CArgs& args,
231 CBlastOptions& opt)
232 {
233 if (args.Exist(kArgEvalue) && args[kArgEvalue]) {
234 opt.SetEvalueThreshold(args[kArgEvalue].AsDouble());
235 }
236
237 int gap_open=0, gap_extend=0;
238 if (args.Exist(kArgMatrixName) && args[kArgMatrixName])
239 BLAST_GetProteinGapExistenceExtendParams
240 (args[kArgMatrixName].AsString().c_str(), &gap_open, &gap_extend);
241
242 if (args.Exist(kArgGapOpen) && args[kArgGapOpen]) {
243 opt.SetGapOpeningCost(args[kArgGapOpen].AsInteger());
244 }
245 else if (args.Exist(kArgMatrixName) && args[kArgMatrixName]) {
246 opt.SetGapOpeningCost(gap_open);
247 }
248
249 if (args.Exist(kArgGapExtend) && args[kArgGapExtend]) {
250 opt.SetGapExtensionCost(args[kArgGapExtend].AsInteger());
251 }
252 else if (args.Exist(kArgMatrixName) && args[kArgMatrixName]) {
253 opt.SetGapExtensionCost(gap_extend);
254 }
255
256 if (args.Exist(kArgUngappedXDropoff) && args[kArgUngappedXDropoff]) {
257 opt.SetXDropoff(args[kArgUngappedXDropoff].AsDouble());
258 }
259
260 if (args.Exist(kArgGappedXDropoff) && args[kArgGappedXDropoff]) {
261 opt.SetGapXDropoff(args[kArgGappedXDropoff].AsDouble());
262 }
263
264 if (args.Exist(kArgFinalGappedXDropoff) && args[kArgFinalGappedXDropoff]) {
265 opt.SetGapXDropoffFinal(args[kArgFinalGappedXDropoff].AsDouble());
266 }
267
268 if ( args.Exist(kArgWordSize) && args[kArgWordSize]) {
269 if (m_QueryIsProtein && args[kArgWordSize].AsInteger() > 5)
270 opt.SetLookupTableType(eCompressedAaLookupTable);
271 opt.SetWordSize(args[kArgWordSize].AsInteger());
272 }
273
274 if (args.Exist(kArgEffSearchSpace) && args[kArgEffSearchSpace]) {
275 CNcbiEnvironment env;
276 env.Set("OLD_FSC", "true");
277 opt.SetEffectiveSearchSpace(args[kArgEffSearchSpace].AsInt8());
278 }
279
280 if (args.Exist(kArgPercentIdentity) && args[kArgPercentIdentity]) {
281 opt.SetPercentIdentity(args[kArgPercentIdentity].AsDouble());
282 }
283
284 if (args.Exist(kArgQueryCovHspPerc) && args[kArgQueryCovHspPerc]) {
285 opt.SetQueryCovHspPerc(args[kArgQueryCovHspPerc].AsDouble());
286 }
287
288 if (args.Exist(kArgMaxHSPsPerSubject) && args[kArgMaxHSPsPerSubject]) {
289 opt.SetMaxHspsPerSubject(args[kArgMaxHSPsPerSubject].AsInteger());
290 }
291
292 if (args.Exist(kArgSumStats) && args[kArgSumStats]) {
293 opt.SetSumStatisticsMode(args[kArgSumStats].AsBoolean());
294 }
295 }
296
297 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)298 CFilteringArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
299 {
300 arg_desc.SetCurrentGroup("Query filtering options");
301
302 if (m_QueryIsProtein) {
303 arg_desc.AddDefaultKey(kArgSegFiltering, "SEG_options",
304 "Filter query sequence with SEG "
305 "(Format: '" + kDfltArgApplyFiltering + "', " +
306 "'window locut hicut', or '" + kDfltArgNoFiltering +
307 "' to disable)",
308 CArgDescriptions::eString, m_FilterByDefault
309 ? kDfltArgSegFiltering : kDfltArgNoFiltering);
310 arg_desc.AddDefaultKey(kArgLookupTableMaskingOnly, "soft_masking",
311 "Apply filtering locations as soft masks",
312 CArgDescriptions::eBoolean,
313 kDfltArgLookupTableMaskingOnlyProt);
314 } else {
315 arg_desc.AddDefaultKey(kArgDustFiltering, "DUST_options",
316 "Filter query sequence with DUST "
317 "(Format: '" + kDfltArgApplyFiltering + "', " +
318 "'level window linker', or '" + kDfltArgNoFiltering +
319 "' to disable)",
320 CArgDescriptions::eString, m_FilterByDefault
321 ? kDfltArgDustFiltering : kDfltArgNoFiltering);
322 arg_desc.AddOptionalKey(kArgFilteringDb, "filtering_database",
323 "BLAST database containing filtering elements (i.e.: repeats)",
324 CArgDescriptions::eString);
325
326 arg_desc.AddOptionalKey(kArgWindowMaskerTaxId, "window_masker_taxid",
327 "Enable WindowMasker filtering using a Taxonomic ID",
328 CArgDescriptions::eInteger);
329
330 arg_desc.AddOptionalKey(kArgWindowMaskerDatabase, "window_masker_db",
331 "Enable WindowMasker filtering using this repeats database.",
332 CArgDescriptions::eString);
333
334 arg_desc.AddDefaultKey(kArgLookupTableMaskingOnly, "soft_masking",
335 "Apply filtering locations as soft masks",
336 CArgDescriptions::eBoolean,
337 kDfltArgLookupTableMaskingOnlyNucl);
338 }
339
340 arg_desc.SetCurrentGroup("");
341 }
342
343 void
x_TokenizeFilteringArgs(const string & filtering_args,vector<string> & output) const344 CFilteringArgs::x_TokenizeFilteringArgs(const string& filtering_args,
345 vector<string>& output) const
346 {
347 output.clear();
348 NStr::Split(filtering_args, " ", output);
349 if (output.size() != 3) {
350 NCBI_THROW(CInputException, eInvalidInput,
351 "Invalid number of arguments to filtering option");
352 }
353 }
354
355 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)356 CFilteringArgs::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& opt)
357 {
358 if (args[kArgLookupTableMaskingOnly]) {
359 opt.SetMaskAtHash(args[kArgLookupTableMaskingOnly].AsBoolean());
360 }
361
362 vector<string> tokens;
363
364 try {
365 if (m_QueryIsProtein && args[kArgSegFiltering]) {
366 const string& seg_opts = args[kArgSegFiltering].AsString();
367 if (seg_opts == kDfltArgNoFiltering) {
368 opt.SetSegFiltering(false);
369 } else if (seg_opts == kDfltArgApplyFiltering) {
370 opt.SetSegFiltering(true);
371 } else {
372 x_TokenizeFilteringArgs(seg_opts, tokens);
373 opt.SetSegFilteringWindow(NStr::StringToInt(tokens[0]));
374 opt.SetSegFilteringLocut(NStr::StringToDouble(tokens[1]));
375 opt.SetSegFilteringHicut(NStr::StringToDouble(tokens[2]));
376 }
377 }
378
379 if ( !m_QueryIsProtein && args[kArgDustFiltering]) {
380 const string& dust_opts = args[kArgDustFiltering].AsString();
381 if (dust_opts == kDfltArgNoFiltering) {
382 opt.SetDustFiltering(false);
383 } else if (dust_opts == kDfltArgApplyFiltering) {
384 opt.SetDustFiltering(true);
385 } else {
386 x_TokenizeFilteringArgs(dust_opts, tokens);
387 opt.SetDustFilteringLevel(NStr::StringToInt(tokens[0]));
388 opt.SetDustFilteringWindow(NStr::StringToInt(tokens[1]));
389 opt.SetDustFilteringLinker(NStr::StringToInt(tokens[2]));
390 }
391 }
392 } catch (const CStringException& e) {
393 if (e.GetErrCode() == CStringException::eConvert) {
394 NCBI_THROW(CInputException, eInvalidInput,
395 "Invalid input for filtering parameters");
396 }
397 }
398
399 int filter_dbs = 0;
400
401 if (args.Exist(kArgFilteringDb) && args[kArgFilteringDb]) {
402 opt.SetRepeatFilteringDB(args[kArgFilteringDb].AsString().c_str());
403 filter_dbs++;
404 }
405
406 if (args.Exist(kArgWindowMaskerTaxId) &&
407 args[kArgWindowMaskerTaxId]) {
408
409 opt.SetWindowMaskerTaxId
410 (args[kArgWindowMaskerTaxId].AsInteger());
411
412 filter_dbs++;
413 }
414
415 if (args.Exist(kArgWindowMaskerDatabase) &&
416 args[kArgWindowMaskerDatabase]) {
417 const string& stat_file = args[kArgWindowMaskerDatabase].AsString();
418 const CSeqMaskerIstatFactory::EStatType type =
419 CSeqMaskerIstatFactory::DiscoverStatType(stat_file);
420 if (type != CSeqMaskerIstatFactory::eOBinary &&
421 type != CSeqMaskerIstatFactory::eBinary) {
422 string msg("Only optimized binary windowmasker stat files are supported");
423 NCBI_THROW(CInputException, eInvalidInput, msg);
424 }
425
426 opt.SetWindowMaskerDatabase(stat_file.c_str());
427 filter_dbs++;
428 }
429
430 if (filter_dbs > 1) {
431 string msg =
432 string("Please specify at most one of ") + kArgFilteringDb + ", " +
433 kArgWindowMaskerTaxId + ", or " + kArgWindowMaskerDatabase + ".";
434
435 NCBI_THROW(CInputException, eInvalidInput, msg);
436 }
437 }
438
439 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)440 CWindowSizeArg::SetArgumentDescriptions(CArgDescriptions& arg_desc)
441 {
442 arg_desc.SetCurrentGroup("Extension options");
443 // 2-hit wordfinder window size
444 arg_desc.AddOptionalKey(kArgWindowSize, "int_value",
445 "Multiple hits window size, use 0 to specify "
446 "1-hit algorithm",
447 CArgDescriptions::eInteger);
448 arg_desc.SetConstraint(kArgWindowSize,
449 new CArgAllowValuesGreaterThanOrEqual(0));
450 arg_desc.SetCurrentGroup("");
451 }
452
453 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)454 CWindowSizeArg::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& opt)
455 {
456 if (args[kArgWindowSize]) {
457 opt.SetWindowSize(args[kArgWindowSize].AsInteger());
458 } else {
459 int window = -1;
460 BLAST_GetSuggestedWindowSize(opt.GetProgramType(),
461 opt.GetMatrixName(),
462 &window);
463 if (window != -1) {
464 opt.SetWindowSize(window);
465 }
466 }
467 }
468
469 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)470 COffDiagonalRangeArg::SetArgumentDescriptions(CArgDescriptions& arg_desc)
471 {
472 arg_desc.SetCurrentGroup("Extension options");
473 // 2-hit wordfinder off diagonal range
474 arg_desc.AddDefaultKey(kArgOffDiagonalRange, "int_value",
475 "Number of off-diagonals to search for the 2nd hit, "
476 "use 0 to turn off",
477 CArgDescriptions::eInteger,
478 NStr::IntToString(kDfltOffDiagonalRange));
479 arg_desc.SetConstraint(kArgOffDiagonalRange,
480 new CArgAllowValuesGreaterThanOrEqual(0));
481 arg_desc.SetCurrentGroup("");
482 }
483
484 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)485 COffDiagonalRangeArg::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& opt)
486 {
487 if (args[kArgOffDiagonalRange]) {
488 opt.SetOffDiagonalRange(args[kArgOffDiagonalRange].AsInteger());
489 } else {
490 opt.SetOffDiagonalRange(0);
491 }
492 }
493
494 // Options specific to rmblastn -RMH-
495 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)496 CRMBlastNArg::SetArgumentDescriptions(CArgDescriptions& arg_desc)
497 {
498 arg_desc.SetCurrentGroup("General search options");
499
500 arg_desc.AddDefaultKey(kArgMatrixName, "matrix_name",
501 "Scoring matrix name",
502 CArgDescriptions::eString,
503 string(""));
504
505 arg_desc.AddFlag(kArgComplexityAdj,
506 "Use complexity adjusted scoring",
507 true);
508
509
510 arg_desc.AddDefaultKey(kArgMaskLevel, "int_value",
511 "Masklevel - percentage overlap allowed per "
512 "query domain [0-101]",
513 CArgDescriptions::eInteger,
514 kDfltArgMaskLevel);
515 arg_desc.SetConstraint(kArgMaskLevel,
516 new CArgAllowValuesLessThanOrEqual(101));
517
518 arg_desc.SetCurrentGroup("");
519 }
520
521 // Options specific to rmblastn -RMH-
522 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)523 CRMBlastNArg::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& opt)
524 {
525 if (args[kArgMatrixName]) {
526 opt.SetMatrixName(args[kArgMatrixName].AsString().c_str());
527 }
528
529 opt.SetComplexityAdjMode( args[kArgComplexityAdj] );
530
531 if (args[kArgMaskLevel]) {
532 opt.SetMaskLevel(args[kArgMaskLevel].AsInteger());
533 }
534
535 if (args[kArgMinRawGappedScore]) {
536 opt.SetCutoffScore(args[kArgMinRawGappedScore].AsInteger());
537 }else if (args[kArgUngappedXDropoff]) {
538 opt.SetCutoffScore(args[kArgUngappedXDropoff].AsInteger());
539 }
540 }
541
542 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)543 CWordThresholdArg::SetArgumentDescriptions(CArgDescriptions& arg_desc)
544 {
545 arg_desc.SetCurrentGroup("General search options");
546 // lookup table word score threshold
547 arg_desc.AddOptionalKey(kArgWordScoreThreshold, "float_value",
548 "Minimum word score such that the word is added to the "
549 "BLAST lookup table",
550 CArgDescriptions::eDouble);
551 arg_desc.SetConstraint(kArgWordScoreThreshold,
552 new CArgAllowValuesGreaterThanOrEqual(0));
553 arg_desc.SetCurrentGroup("");
554 }
555
556 static bool
s_IsDefaultWordThreshold(EProgram program,double threshold)557 s_IsDefaultWordThreshold(EProgram program, double threshold)
558 {
559 int word_threshold = static_cast<int>(threshold);
560 bool retval = true;
561 if (program == eBlastp &&
562 word_threshold != BLAST_WORD_THRESHOLD_BLASTP) {
563 retval = false;
564 } else if (program == eBlastx &&
565 word_threshold != BLAST_WORD_THRESHOLD_BLASTX) {
566 retval = false;
567 } else if (program == eTblastn &&
568 word_threshold != BLAST_WORD_THRESHOLD_TBLASTN) {
569 retval = false;
570 }
571 return retval;
572 }
573
574 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)575 CWordThresholdArg::ExtractAlgorithmOptions(const CArgs& args,
576 CBlastOptions& opt)
577 {
578 if (args[kArgWordScoreThreshold]) {
579 opt.SetWordThreshold(args[kArgWordScoreThreshold].AsDouble());
580 } else if (s_IsDefaultWordThreshold(opt.GetProgram(),
581 opt.GetWordThreshold())) {
582 double threshold = -1;
583 BLAST_GetSuggestedThreshold(opt.GetProgramType(),
584 opt.GetMatrixName(),
585 &threshold);
586 if (threshold != -1) {
587 opt.SetWordThreshold(threshold);
588 }
589 }
590 }
591
592 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)593 CMatrixNameArg::SetArgumentDescriptions(CArgDescriptions& arg_desc)
594 {
595 arg_desc.SetCurrentGroup("General search options");
596 arg_desc.AddOptionalKey(kArgMatrixName, "matrix_name",
597 "Scoring matrix name (normally BLOSUM62)",
598 CArgDescriptions::eString);
599 arg_desc.SetCurrentGroup("");
600 }
601
602 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)603 CMatrixNameArg::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& opt)
604 {
605 if (args[kArgMatrixName]) {
606 opt.SetMatrixName(args[kArgMatrixName].AsString().c_str());
607 }
608 }
609
610 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)611 CNuclArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
612 {
613 // TLM arg_desc.SetCurrentGroup("Nucleotide scoring options");
614
615 arg_desc.SetCurrentGroup("General search options");
616 // blastn mismatch penalty
617 arg_desc.AddOptionalKey(kArgMismatch, "penalty",
618 "Penalty for a nucleotide mismatch",
619 CArgDescriptions::eInteger);
620 arg_desc.SetConstraint(kArgMismatch,
621 new CArgAllowValuesLessThanOrEqual(0));
622
623 // blastn match reward
624 arg_desc.AddOptionalKey(kArgMatch, "reward",
625 "Reward for a nucleotide match",
626 CArgDescriptions::eInteger);
627 arg_desc.SetConstraint(kArgMatch,
628 new CArgAllowValuesGreaterThanOrEqual(0));
629
630
631 arg_desc.SetCurrentGroup("Extension options");
632 arg_desc.AddFlag(kArgNoGreedyExtension,
633 "Use non-greedy dynamic programming extension",
634 true);
635
636 arg_desc.SetCurrentGroup("");
637 }
638
639 void
ExtractAlgorithmOptions(const CArgs & cmd_line_args,CBlastOptions & options)640 CNuclArgs::ExtractAlgorithmOptions(const CArgs& cmd_line_args,
641 CBlastOptions& options)
642 {
643 if (cmd_line_args.Exist(kArgMismatch) && cmd_line_args[kArgMismatch]) {
644 options.SetMismatchPenalty(cmd_line_args[kArgMismatch].AsInteger());
645 }
646 if (cmd_line_args.Exist(kArgMatch) && cmd_line_args[kArgMatch]) {
647 options.SetMatchReward(cmd_line_args[kArgMatch].AsInteger());
648 }
649
650 if (cmd_line_args.Exist(kArgNoGreedyExtension) &&
651 cmd_line_args[kArgNoGreedyExtension]) {
652 options.SetGapExtnAlgorithm(eDynProgScoreOnly);
653 options.SetGapTracebackAlgorithm(eDynProgTbck);
654 }
655 }
656
657 const string CDiscontiguousMegablastArgs::kTemplType_Coding("coding");
658 const string CDiscontiguousMegablastArgs::kTemplType_Optimal("optimal");
659 const string
660 CDiscontiguousMegablastArgs::kTemplType_CodingAndOptimal("coding_and_optimal");
661
662 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)663 CDiscontiguousMegablastArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
664 {
665 arg_desc.SetCurrentGroup("Extension options");
666 // FIXME: this can be applied to any program, but since it was only offered
667 // in megablast, we're putting it here
668 arg_desc.AddOptionalKey(kArgMinRawGappedScore, "int_value",
669 "Minimum raw gapped score to keep an alignment "
670 "in the preliminary gapped and traceback stages",
671 CArgDescriptions::eInteger);
672
673 arg_desc.SetCurrentGroup("Discontiguous MegaBLAST options");
674
675 arg_desc.AddOptionalKey(kArgDMBTemplateType, "type",
676 "Discontiguous MegaBLAST template type",
677 CArgDescriptions::eString);
678 arg_desc.SetConstraint(kArgDMBTemplateType, &(*new CArgAllow_Strings,
679 kTemplType_Coding,
680 kTemplType_Optimal,
681 kTemplType_CodingAndOptimal));
682 arg_desc.SetDependency(kArgDMBTemplateType,
683 CArgDescriptions::eRequires,
684 kArgDMBTemplateLength);
685
686 arg_desc.AddOptionalKey(kArgDMBTemplateLength, "int_value",
687 "Discontiguous MegaBLAST template length",
688 CArgDescriptions::eInteger);
689 set<int> allowed_values;
690 allowed_values.insert(16);
691 allowed_values.insert(18);
692 allowed_values.insert(21);
693 arg_desc.SetConstraint(kArgDMBTemplateLength,
694 new CArgAllowIntegerSet(allowed_values));
695 arg_desc.SetDependency(kArgDMBTemplateLength,
696 CArgDescriptions::eRequires,
697 kArgDMBTemplateType);
698
699 arg_desc.SetCurrentGroup("");
700 }
701
702 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & options)703 CDiscontiguousMegablastArgs::ExtractAlgorithmOptions(const CArgs& args,
704 CBlastOptions& options)
705 {
706 if (args[kArgMinRawGappedScore]) {
707 options.SetCutoffScore(args[kArgMinRawGappedScore].AsInteger());
708 }
709
710 if (args[kArgDMBTemplateType]) {
711 const string& type = args[kArgDMBTemplateType].AsString();
712 EDiscWordType temp_type = eMBWordCoding;
713
714 if (type == kTemplType_Coding) {
715 temp_type = eMBWordCoding;
716 } else if (type == kTemplType_Optimal) {
717 temp_type = eMBWordOptimal;
718 } else if (type == kTemplType_CodingAndOptimal) {
719 temp_type = eMBWordTwoTemplates;
720 } else {
721 abort();
722 }
723 options.SetMBTemplateType(static_cast<unsigned char>(temp_type));
724 }
725
726 if (args[kArgDMBTemplateLength]) {
727 unsigned char tlen =
728 static_cast<unsigned char>(args[kArgDMBTemplateLength].AsInteger());
729 options.SetMBTemplateLength(tlen);
730 }
731
732 // FIXME: should the window size be adjusted if this is set?
733 }
734
735 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)736 CCompositionBasedStatsArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
737 {
738 arg_desc.SetCurrentGroup("General search options");
739 // composition based statistics, keep in sync with ECompoAdjustModes
740 // documentation in composition_constants.h
741
742 string zero_opt = !m_ZeroOptDescr.empty() ?
743 (string)" 0 or F or f: " + m_ZeroOptDescr + "\n" :
744 " 0 or F or f: No composition-based statistics\n";
745
746 string one_opt_insrt = m_Is2and3Supported ? "" : " or T or t";
747
748 string more_opts = m_Is2and3Supported ?
749 " 2 or T or t : Composition-based score adjustment as in "
750 "Bioinformatics 21:902-911,\n"
751 " 2005, conditioned on sequence properties\n"
752 " 3: Composition-based score adjustment as in "
753 "Bioinformatics 21:902-911,\n"
754 " 2005, unconditionally\n" : "";
755
756 string legend = (string)"Use composition-based statistics:\n"
757 " D or d: default (equivalent to " + m_DefaultOpt + " )\n"
758 + zero_opt
759 + " 1" + one_opt_insrt + ": Composition-based statistics "
760 "as in NAR 29:2994-3005, 2001\n"
761 + more_opts;
762
763 arg_desc.AddDefaultKey(kArgCompBasedStats, "compo", legend,
764 CArgDescriptions::eString, m_DefaultOpt);
765
766
767 arg_desc.SetCurrentGroup("Miscellaneous options");
768 // Use Smith-Waterman algorithm in traceback stage
769 // FIXME: available only for gapped blastp/tblastn, and with
770 // composition-based statistics
771 arg_desc.AddFlag(kArgUseSWTraceback,
772 "Compute locally optimal Smith-Waterman alignments?",
773 true);
774 arg_desc.SetCurrentGroup("");
775 }
776
777 /**
778 * @brief Auxiliary function to set the composition based statistics and smith
779 * waterman options
780 *
781 * @param opt BLAST options object [in|out]
782 * @param comp_stat_string command line value for composition based statistics
783 * [in]
784 * @param smith_waterman_value command line value for determining the use of
785 * the smith-waterman algorithm [in]
786 * @param ungapped pointer to the value which determines whether the search
787 * should be ungapped or not. It is NULL if ungapped searches are not
788 * applicable
789 * @param is_deltablast is program deltablast [in]
790 */
791 static void
s_SetCompositionBasedStats(CBlastOptions & opt,const string & comp_stat_string,bool smith_waterman_value,bool * ungapped)792 s_SetCompositionBasedStats(CBlastOptions& opt,
793 const string& comp_stat_string,
794 bool smith_waterman_value,
795 bool* ungapped)
796 {
797 const EProgram program = opt.GetProgram();
798 if (program == eBlastp || program == eTblastn ||
799 program == ePSIBlast || program == ePSITblastn ||
800 program == eRPSBlast || program == eRPSTblastn ||
801 program == eBlastx || program == eDeltaBlast) {
802
803 ECompoAdjustModes compo_mode = eNoCompositionBasedStats;
804
805 switch (comp_stat_string[0]) {
806 case '0': case 'F': case 'f':
807 compo_mode = eNoCompositionBasedStats;
808 break;
809 case '1':
810 compo_mode = eCompositionBasedStats;
811 break;
812 case 'D': case 'd':
813 if ((program == eRPSBlast) || (program == eRPSTblastn)) {
814 compo_mode = eNoCompositionBasedStats;
815 }
816 else if (program == eDeltaBlast) {
817 compo_mode = eCompositionBasedStats;
818 }
819 else {
820 compo_mode = eCompositionMatrixAdjust;
821 }
822 break;
823 case '2':
824 compo_mode = eCompositionMatrixAdjust;
825 break;
826 case '3':
827 compo_mode = eCompoForceFullMatrixAdjust;
828 break;
829 case 'T': case 't':
830 compo_mode = (program == eRPSBlast || program == eRPSTblastn || program == eDeltaBlast) ?
831 eCompositionBasedStats : eCompositionMatrixAdjust;
832 break;
833 }
834
835 if(program == ePSITblastn) {
836 compo_mode = eNoCompositionBasedStats;
837 }
838
839 if (ungapped && *ungapped && compo_mode != eNoCompositionBasedStats) {
840 NCBI_THROW(CInputException, eInvalidInput,
841 "Composition-adjusted searched are not supported with "
842 "an ungapped search, please add -comp_based_stats F or "
843 "do a gapped search");
844 }
845
846 opt.SetCompositionBasedStats(compo_mode);
847 if (program == eBlastp &&
848 compo_mode != eNoCompositionBasedStats &&
849 tolower(comp_stat_string[1]) == 'u') {
850 opt.SetUnifiedP(1);
851 }
852 opt.SetSmithWatermanMode(smith_waterman_value);
853 }
854 }
855
856 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)857 CCompositionBasedStatsArgs::ExtractAlgorithmOptions(const CArgs& args,
858 CBlastOptions& opt)
859 {
860 if (args[kArgCompBasedStats]) {
861 unique_ptr<bool> ungapped(args.Exist(kArgUngapped)
862 ? new bool(args[kArgUngapped]) : 0);
863 s_SetCompositionBasedStats(opt,
864 args[kArgCompBasedStats].AsString(),
865 args[kArgUseSWTraceback],
866 ungapped.get());
867 }
868
869 }
870
871 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)872 CGappedArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
873 {
874 // perform gapped search
875 #if 0
876 arg_desc.AddOptionalKey(ARG_GAPPED, "gapped",
877 "Perform gapped alignment (default T, but "
878 "not available for tblastx)",
879 CArgDescriptions::eBoolean,
880 CArgDescriptions::fOptionalSeparator);
881 arg_desc.AddAlias("-gapped", ARG_GAPPED);
882 #endif
883 arg_desc.SetCurrentGroup("Extension options");
884 arg_desc.AddFlag(kArgUngapped, "Perform ungapped alignment only?", true);
885 arg_desc.SetCurrentGroup("");
886 }
887
888 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & options)889 CGappedArgs::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& options)
890 {
891 #if 0
892 if (args[ARG_GAPPED] && options.GetProgram() != eTblastx) {
893 options.SetGappedMode(args[ARG_GAPPED].AsBoolean());
894 }
895 #endif
896 options.SetGappedMode( !args[kArgUngapped] );
897 }
898
899 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)900 CLargestIntronSizeArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
901 {
902 arg_desc.SetCurrentGroup("General search options");
903 // largest intron length
904 arg_desc.AddDefaultKey(kArgMaxIntronLength, "length",
905 "Length of the largest intron allowed in a translated "
906 "nucleotide sequence when linking multiple distinct "
907 "alignments",
908 CArgDescriptions::eInteger,
909 NStr::IntToString(kDfltArgMaxIntronLength));
910 arg_desc.SetConstraint(kArgMaxIntronLength,
911 new CArgAllowValuesGreaterThanOrEqual(0));
912 arg_desc.SetCurrentGroup("");
913 }
914
915 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)916 CLargestIntronSizeArgs::ExtractAlgorithmOptions(const CArgs& args,
917 CBlastOptions& opt)
918 {
919 if ( !args[kArgMaxIntronLength] ) {
920 return;
921 }
922
923 // sum statistics are defauled to be on unless a cmdline option is set
924 opt.SetLongestIntronLength(args[kArgMaxIntronLength].AsInteger());
925
926 }
927
928 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)929 CFrameShiftArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
930 {
931 arg_desc.SetCurrentGroup("General search options");
932 // applicable in blastx/tblastn, off by default
933 arg_desc.AddOptionalKey(kArgFrameShiftPenalty, "frameshift",
934 "Frame shift penalty (for use with out-of-frame "
935 "gapped alignment in blastx or tblastn, default "
936 "ignored)",
937 CArgDescriptions::eInteger);
938 arg_desc.SetConstraint(kArgFrameShiftPenalty,
939 new CArgAllowValuesGreaterThanOrEqual(1));
940 arg_desc.SetDependency(kArgFrameShiftPenalty, CArgDescriptions::eExcludes,kArgUngapped);
941 arg_desc.SetCurrentGroup("");
942 }
943
944 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)945 CFrameShiftArgs::ExtractAlgorithmOptions(const CArgs& args,
946 CBlastOptions& opt)
947 {
948 if (args[kArgFrameShiftPenalty]) {
949 if (args[kArgCompBasedStats]) {
950 string cbs = args[kArgCompBasedStats].AsString();
951
952 if ((cbs[0] != '0' )&& (cbs[0] != 'F') && (cbs[0] != 'f')) {
953 NCBI_THROW(CInputException, eInvalidInput,
954 "Composition-adjusted searches are not supported with "
955 "Out-Of-Frame option, please add -comp_based_stats F ");
956 }
957 }
958
959 opt.SetOutOfFrameMode();
960 opt.SetFrameShiftPenalty(args[kArgFrameShiftPenalty].AsInteger());
961 }
962 }
963
964 /// Auxiliary class to validate the genetic code input
965 class CArgAllowGeneticCodeInteger : public CArgAllow
966 {
967 protected:
968 /// Overloaded method from CArgAllow
Verify(const string & value) const969 virtual bool Verify(const string& value) const {
970 static int gcs[] = {1,2,3,4,5,6,9,10,11,12,13,14,15,16,21,22,23,24,25,26,27,28,29,30,31,33};
971 static const set<int> genetic_codes(gcs, gcs+sizeof(gcs)/sizeof(*gcs));
972 const int val = NStr::StringToInt(value);
973 return (genetic_codes.find(val) != genetic_codes.end());
974 }
975
976 /// Overloaded method from CArgAllow
GetUsage(void) const977 virtual string GetUsage(void) const {
978 return "values between: 1-6, 9-16, 21-31, 33";
979 }
980 };
981
982 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)983 CGeneticCodeArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
984 {
985 if (m_Target == eQuery) {
986 arg_desc.SetCurrentGroup("Input query options");
987 // query genetic code
988 arg_desc.AddDefaultKey(kArgQueryGeneticCode, "int_value",
989 "Genetic code to use to translate query (see https://www.ncbi.nlm.nih.gov/Taxonomy/taxonomyhome.html/index.cgi?chapter=cgencodes for details)\n",
990 CArgDescriptions::eInteger,
991 NStr::IntToString(BLAST_GENETIC_CODE));
992 arg_desc.SetConstraint(kArgQueryGeneticCode,
993 new CArgAllowGeneticCodeInteger());
994 } else {
995 arg_desc.SetCurrentGroup("General search options");
996 // DB genetic code
997 arg_desc.AddDefaultKey(kArgDbGeneticCode, "int_value",
998 "Genetic code to use to translate "
999 "database/subjects (see user manual for details)\n",
1000 CArgDescriptions::eInteger,
1001 NStr::IntToString(BLAST_GENETIC_CODE));
1002 arg_desc.SetConstraint(kArgDbGeneticCode,
1003 new CArgAllowGeneticCodeInteger());
1004
1005 }
1006 arg_desc.SetCurrentGroup("");
1007 }
1008
1009 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1010 CGeneticCodeArgs::ExtractAlgorithmOptions(const CArgs& args,
1011 CBlastOptions& opt)
1012 {
1013 const EProgram program = opt.GetProgram();
1014
1015 if (m_Target == eQuery && args[kArgQueryGeneticCode]) {
1016 opt.SetQueryGeneticCode(args[kArgQueryGeneticCode].AsInteger());
1017 }
1018
1019 if (m_Target == eDatabase && args[kArgDbGeneticCode] &&
1020 (program == eTblastn || program == eTblastx) ) {
1021 opt.SetDbGeneticCode(args[kArgDbGeneticCode].AsInteger());
1022 }
1023 }
1024
1025 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1026 CGapTriggerArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1027 {
1028 arg_desc.SetCurrentGroup("Extension options");
1029
1030 const double default_value = m_QueryIsProtein
1031 ? BLAST_GAP_TRIGGER_PROT : BLAST_GAP_TRIGGER_NUCL;
1032 arg_desc.AddDefaultKey(kArgGapTrigger, "float_value",
1033 "Number of bits to trigger gapping",
1034 CArgDescriptions::eDouble,
1035 NStr::DoubleToString(default_value));
1036 arg_desc.SetCurrentGroup("");
1037 }
1038
1039 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1040 CGapTriggerArgs::ExtractAlgorithmOptions(const CArgs& args,
1041 CBlastOptions& opt)
1042 {
1043 if (args[kArgGapTrigger]) {
1044 opt.SetGapTrigger(args[kArgGapTrigger].AsDouble());
1045 }
1046 }
1047
1048 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1049 CPssmEngineArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1050 {
1051 arg_desc.SetCurrentGroup("PSSM engine options");
1052
1053 // Pseudo count
1054 arg_desc.AddDefaultKey(kArgPSIPseudocount, "pseudocount",
1055 "Pseudo-count value used when constructing PSSM",
1056 CArgDescriptions::eInteger,
1057 NStr::IntToString(PSI_PSEUDO_COUNT_CONST));
1058
1059 if (m_IsDeltaBlast) {
1060 arg_desc.AddDefaultKey(kArgDomainInclusionEThreshold, "ethresh",
1061 "E-value inclusion threshold for alignments "
1062 "with conserved domains",
1063 CArgDescriptions::eDouble,
1064 NStr::DoubleToString(DELTA_INCLUSION_ETHRESH));
1065 }
1066
1067 // Evalue inclusion threshold
1068 arg_desc.AddDefaultKey(kArgPSIInclusionEThreshold, "ethresh",
1069 "E-value inclusion threshold for pairwise alignments",
1070 CArgDescriptions::eDouble,
1071 NStr::DoubleToString(PSI_INCLUSION_ETHRESH));
1072
1073 arg_desc.SetCurrentGroup("");
1074 }
1075
1076 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1077 CPssmEngineArgs::ExtractAlgorithmOptions(const CArgs& args,
1078 CBlastOptions& opt)
1079 {
1080 if (args[kArgPSIPseudocount]) {
1081 opt.SetPseudoCount(args[kArgPSIPseudocount].AsInteger());
1082 }
1083
1084 if (args[kArgPSIInclusionEThreshold]) {
1085 opt.SetInclusionThreshold(args[kArgPSIInclusionEThreshold].AsDouble());
1086 }
1087
1088 if (args.Exist(kArgDomainInclusionEThreshold)
1089 && args[kArgDomainInclusionEThreshold]) {
1090
1091 opt.SetDomainInclusionThreshold(
1092 args[kArgDomainInclusionEThreshold].AsDouble());
1093 }
1094 }
1095
1096 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1097 CPsiBlastArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1098 {
1099
1100 if (m_DbTarget == eNucleotideDb) {
1101 arg_desc.SetCurrentGroup("PSI-TBLASTN options");
1102
1103 // PSI-tblastn checkpoint
1104 arg_desc.AddOptionalKey(kArgPSIInputChkPntFile, "psi_chkpt_file",
1105 "PSI-TBLASTN checkpoint file",
1106 CArgDescriptions::eInputFile);
1107 arg_desc.SetDependency(kArgPSIInputChkPntFile,
1108 CArgDescriptions::eExcludes,
1109 kArgRemote);
1110 } else {
1111 arg_desc.SetCurrentGroup("PSI-BLAST options");
1112
1113 // Number of iterations
1114 arg_desc.AddDefaultKey(kArgPSINumIterations, "int_value",
1115 "Number of iterations to perform (0 means run "
1116 "until convergence)", CArgDescriptions::eInteger,
1117 NStr::IntToString(kDfltArgPSINumIterations));
1118 arg_desc.SetConstraint(kArgPSINumIterations,
1119 new CArgAllowValuesGreaterThanOrEqual(0));
1120 arg_desc.SetDependency(kArgPSINumIterations,
1121 CArgDescriptions::eExcludes,
1122 kArgRemote);
1123 // checkpoint file
1124 arg_desc.AddOptionalKey(kArgPSIOutputChkPntFile, "checkpoint_file",
1125
1126 "File name to store checkpoint file",
1127 CArgDescriptions::eOutputFile);
1128 // ASCII matrix file
1129 arg_desc.AddOptionalKey(kArgAsciiPssmOutputFile, "ascii_mtx_file",
1130 "File name to store ASCII version of PSSM",
1131 CArgDescriptions::eOutputFile);
1132
1133 arg_desc.AddFlag(kArgSaveLastPssm, "Save PSSM after the last database "
1134 "search");
1135 arg_desc.AddFlag(kArgSaveAllPssms, "Save PSSM after each iteration "
1136 "(file name is given in -save_pssm or "
1137 "-save_ascii_pssm options)");
1138
1139 if (!m_IsDeltaBlast) {
1140 vector<string> msa_exclusions;
1141 msa_exclusions.push_back(kArgPSIInputChkPntFile);
1142 msa_exclusions.push_back(kArgQuery);
1143 msa_exclusions.push_back(kArgQueryLocation);
1144 // pattern and MSA is not supported
1145 msa_exclusions.push_back(kArgPHIPatternFile);
1146 arg_desc.SetCurrentGroup("");
1147 arg_desc.SetCurrentGroup("");
1148
1149 // MSA restart file
1150 arg_desc.SetCurrentGroup("PSSM engine options");
1151 arg_desc.AddOptionalKey(kArgMSAInputFile, "align_restart",
1152 "File name of multiple sequence alignment to "
1153 "restart PSI-BLAST",
1154 CArgDescriptions::eInputFile);
1155 ITERATE(vector<string>, exclusion, msa_exclusions) {
1156 arg_desc.SetDependency(kArgMSAInputFile,
1157 CArgDescriptions::eExcludes,
1158 *exclusion);
1159 }
1160
1161 arg_desc.AddOptionalKey(kArgMSAMasterIndex, "index",
1162 "Ordinal number (1-based index) of the sequence"
1163 " to use as a master in the multiple sequence "
1164 "alignment. If not provided, the first sequence"
1165 " in the multiple sequence alignment will be "
1166 "used", CArgDescriptions::eInteger);
1167 arg_desc.SetConstraint(kArgMSAMasterIndex,
1168 new CArgAllowValuesGreaterThanOrEqual(1));
1169 ITERATE(vector<string>, exclusion, msa_exclusions) {
1170 arg_desc.SetDependency(kArgMSAMasterIndex,
1171 CArgDescriptions::eExcludes,
1172 *exclusion);
1173 }
1174 arg_desc.SetDependency(kArgMSAMasterIndex,
1175 CArgDescriptions::eRequires,
1176 kArgMSAInputFile);
1177 arg_desc.SetDependency(kArgMSAMasterIndex,
1178 CArgDescriptions::eExcludes,
1179 kArgIgnoreMsaMaster);
1180
1181 arg_desc.AddFlag(kArgIgnoreMsaMaster,
1182 "Ignore the master sequence when creating PSSM", true);
1183
1184 vector<string> ignore_pssm_master_exclusions;
1185 ignore_pssm_master_exclusions.push_back(kArgMSAMasterIndex);
1186 ignore_pssm_master_exclusions.push_back(kArgPSIInputChkPntFile);
1187 ignore_pssm_master_exclusions.push_back(kArgQuery);
1188 ignore_pssm_master_exclusions.push_back(kArgQueryLocation);
1189 ITERATE(vector<string>, exclusion, msa_exclusions) {
1190 arg_desc.SetDependency(kArgIgnoreMsaMaster,
1191 CArgDescriptions::eExcludes,
1192 *exclusion);
1193 }
1194 arg_desc.SetDependency(kArgIgnoreMsaMaster,
1195 CArgDescriptions::eRequires,
1196 kArgMSAInputFile);
1197
1198 // PSI-BLAST checkpoint
1199 arg_desc.AddOptionalKey(kArgPSIInputChkPntFile, "psi_chkpt_file",
1200 "PSI-BLAST checkpoint file",
1201 CArgDescriptions::eInputFile);
1202 }
1203 }
1204
1205 if (!m_IsDeltaBlast) {
1206 arg_desc.SetDependency(kArgPSIInputChkPntFile,
1207 CArgDescriptions::eExcludes,
1208 kArgQuery);
1209 arg_desc.SetDependency(kArgPSIInputChkPntFile,
1210 CArgDescriptions::eExcludes,
1211 kArgQueryLocation);
1212 }
1213 arg_desc.SetCurrentGroup("");
1214 }
1215
1216 CRef<CPssmWithParameters>
x_CreatePssmFromMsa(CNcbiIstream & input_stream,CBlastOptions & opt,bool save_ascii_pssm,unsigned int msa_master_idx,bool ignore_pssm_tmplt_seq)1217 CPsiBlastArgs::x_CreatePssmFromMsa(CNcbiIstream& input_stream,
1218 CBlastOptions& opt, bool save_ascii_pssm,
1219 unsigned int msa_master_idx,
1220 bool ignore_pssm_tmplt_seq)
1221 {
1222 // FIXME get these from CBlastOptions
1223 CPSIBlastOptions psiblast_opts;
1224 PSIBlastOptionsNew(&psiblast_opts);
1225 psiblast_opts->nsg_compatibility_mode = ignore_pssm_tmplt_seq;
1226
1227 CPSIDiagnosticsRequest diags(PSIDiagnosticsRequestNewEx(save_ascii_pssm));
1228 CPsiBlastInputClustalW pssm_input(input_stream, *psiblast_opts,
1229 opt.GetMatrixName(), diags, NULL, 0,
1230 opt.GetGapOpeningCost(),
1231 opt.GetGapExtensionCost(),
1232 msa_master_idx);
1233 CPssmEngine pssm_engine(&pssm_input);
1234 return pssm_engine.Run();
1235 }
1236
1237 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1238 CPsiBlastArgs::ExtractAlgorithmOptions(const CArgs& args,
1239 CBlastOptions& opt)
1240 {
1241 if (m_DbTarget == eProteinDb) {
1242 if (args[kArgPSINumIterations]) {
1243 if(m_NumIterations == 1)
1244 m_NumIterations = args[kArgPSINumIterations].AsInteger();
1245 }
1246
1247 if (args.Exist(kArgSaveLastPssm) && args[kArgSaveLastPssm] &&
1248 (!args.Exist(kArgPSIOutputChkPntFile) ||
1249 !args[kArgPSIOutputChkPntFile]) &&
1250 (!args.Exist(kArgAsciiPssmOutputFile) ||
1251 !args[kArgAsciiPssmOutputFile])) {
1252
1253 NCBI_THROW(CInputException, eInvalidInput, kArgSaveLastPssm +
1254 " option requires " + kArgPSIOutputChkPntFile + " or " +
1255 kArgAsciiPssmOutputFile);
1256 }
1257
1258 if (args.Exist(kArgSaveAllPssms) && args[kArgSaveAllPssms] &&
1259 (!args.Exist(kArgPSIOutputChkPntFile) ||
1260 !args[kArgPSIOutputChkPntFile]) &&
1261 (!args.Exist(kArgAsciiPssmOutputFile) ||
1262 !args[kArgAsciiPssmOutputFile])) {
1263
1264 NCBI_THROW(CInputException, eInvalidInput, kArgSaveAllPssms +
1265 " option requires " + kArgPSIOutputChkPntFile + " or " +
1266 kArgAsciiPssmOutputFile);
1267 }
1268
1269 const bool kSaveAllPssms
1270 = args.Exist(kArgSaveAllPssms) && args[kArgSaveAllPssms];
1271 if (args.Exist(kArgPSIOutputChkPntFile) &&
1272 args[kArgPSIOutputChkPntFile]) {
1273 m_CheckPointOutput.Reset
1274 (new CAutoOutputFileReset
1275 (args[kArgPSIOutputChkPntFile].AsString(), kSaveAllPssms));
1276 }
1277 const bool kSaveAsciiPssm = args[kArgAsciiPssmOutputFile];
1278 if (kSaveAsciiPssm) {
1279 m_AsciiMatrixOutput.Reset
1280 (new CAutoOutputFileReset
1281 (args[kArgAsciiPssmOutputFile].AsString(), kSaveAllPssms));
1282 }
1283 if (args.Exist(kArgMSAInputFile) && args[kArgMSAInputFile]) {
1284 CNcbiIstream& in = args[kArgMSAInputFile].AsInputFile();
1285 unsigned int msa_master_idx = 0;
1286 if (args[kArgMSAMasterIndex]) {
1287 msa_master_idx = args[kArgMSAMasterIndex].AsInteger() - 1;
1288 }
1289 m_Pssm = x_CreatePssmFromMsa(in, opt, kSaveAsciiPssm,
1290 msa_master_idx,
1291 args[kArgIgnoreMsaMaster]);
1292 }
1293 if (!m_IsDeltaBlast) {
1294 opt.SetIgnoreMsaMaster(args[kArgIgnoreMsaMaster]);
1295 }
1296
1297 if (args.Exist(kArgSaveLastPssm) && args[kArgSaveLastPssm]) {
1298 m_SaveLastPssm = true;
1299 }
1300 }
1301
1302 if (args.Exist(kArgPSIInputChkPntFile) && args[kArgPSIInputChkPntFile]) {
1303 CNcbiIstream& in = args[kArgPSIInputChkPntFile].AsInputFile();
1304 _ASSERT(m_Pssm.Empty());
1305 m_Pssm.Reset(new CPssmWithParameters);
1306 try {
1307 switch (CFormatGuess().Format(in)) {
1308 case CFormatGuess::eBinaryASN:
1309 in >> MSerial_AsnBinary >> *m_Pssm;
1310 break;
1311 case CFormatGuess::eTextASN:
1312 in >> MSerial_AsnText >> *m_Pssm;
1313 break;
1314 case CFormatGuess::eXml:
1315 in >> MSerial_Xml >> *m_Pssm;
1316 break;
1317 default:
1318 NCBI_THROW(CInputException, eInvalidInput,
1319 "Unsupported format for PSSM");
1320 }
1321 } catch (const CSerialException&) {
1322 string msg("Unrecognized format for PSSM in ");
1323 msg += args[kArgPSIInputChkPntFile].AsString() + " (must be ";
1324 msg += "PssmWithParameters)";
1325 NCBI_THROW(CInputException, eInvalidInput, msg);
1326 }
1327 _ASSERT(m_Pssm.NotEmpty());
1328 }
1329 }
1330
1331 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1332 CPhiBlastArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1333 {
1334 arg_desc.SetCurrentGroup("PHI-BLAST options");
1335
1336 arg_desc.AddOptionalKey(kArgPHIPatternFile, "file",
1337 "File name containing pattern to search",
1338 CArgDescriptions::eInputFile);
1339 arg_desc.SetDependency(kArgPHIPatternFile,
1340 CArgDescriptions::eExcludes,
1341 kArgPSIInputChkPntFile);
1342
1343 arg_desc.SetCurrentGroup("");
1344 }
1345
1346 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1347 CPhiBlastArgs::ExtractAlgorithmOptions(const CArgs& args,
1348 CBlastOptions& opt)
1349 {
1350 if (args.Exist(kArgPHIPatternFile) && args[kArgPHIPatternFile]) {
1351 CNcbiIstream& in = args[kArgPHIPatternFile].AsInputFile();
1352 in.clear();
1353 in.seekg(0);
1354 char buffer[4096];
1355 string line;
1356 string pattern;
1357 string name;
1358 while (in.getline(buffer, 4096)) {
1359 line = buffer;
1360 string ltype = line.substr(0, 2);
1361 if (ltype == "ID")
1362 name = line.substr(4);
1363 else if (ltype == "PA")
1364 pattern = line.substr(4);
1365 }
1366 if (!pattern.empty())
1367 opt.SetPHIPattern(pattern.c_str(),
1368 (Blast_QueryIsNucleotide(opt.GetProgramType())
1369 ? true : false));
1370 else
1371 NCBI_THROW(CInputException, eInvalidInput,
1372 "PHI pattern not read");
1373 }
1374 }
1375
1376 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1377 CKBlastpArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1378 {
1379 arg_desc.SetCurrentGroup("KBLASTP options");
1380 arg_desc.AddDefaultKey(kArgJDistance, "threshold", "Jaccard Distance",
1381 CArgDescriptions::eDouble, kDfltArgJDistance);
1382 arg_desc.AddDefaultKey(kArgMinHits, "minhits", "minimal number of LSH matches",
1383 CArgDescriptions::eInteger, kDfltArgMinHits);
1384 arg_desc.AddDefaultKey(kArgCandidateSeqs, "candidates", "Number of candidate sequences to process with BLAST",
1385 CArgDescriptions::eInteger, kDfltArgCandidateSeqs);
1386 }
1387
1388 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1389 CKBlastpArgs::ExtractAlgorithmOptions(const CArgs& args,
1390 CBlastOptions& opt)
1391 {
1392 if (args.Exist(kArgJDistance))
1393 m_JDistance = args[kArgJDistance].AsDouble();
1394 if (args.Exist(kArgMinHits))
1395 m_MinHits = args[kArgMinHits].AsInteger();
1396 if (args.Exist(kArgCandidateSeqs))
1397 m_CandidateSeqs = args[kArgCandidateSeqs].AsInteger();
1398 }
1399
1400
1401 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1402 CDeltaBlastArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1403 {
1404 arg_desc.SetCurrentGroup("DELTA-BLAST options");
1405
1406 arg_desc.AddDefaultKey(kArgRpsDb, "database_name", "BLAST domain "
1407 "database name", CArgDescriptions::eString,
1408 kDfltArgRpsDb);
1409
1410 arg_desc.AddFlag(kArgShowDomainHits, "Show domain hits");
1411 arg_desc.SetDependency(kArgShowDomainHits, CArgDescriptions::eExcludes,
1412 kArgRemote);
1413 arg_desc.SetDependency(kArgShowDomainHits, CArgDescriptions::eExcludes,
1414 kArgSubject);
1415 }
1416
1417 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1418 CDeltaBlastArgs::ExtractAlgorithmOptions(const CArgs& args,
1419 CBlastOptions& opt)
1420 {
1421 m_DomainDb.Reset(new CSearchDatabase(args[kArgRpsDb].AsString(),
1422 CSearchDatabase::eBlastDbIsProtein));
1423
1424 if (args.Exist(kArgShowDomainHits)) {
1425 m_ShowDomainHits = args[kArgShowDomainHits];
1426 }
1427 }
1428
1429 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1430 CMappingArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1431 {
1432
1433 arg_desc.SetCurrentGroup("Mapping options");
1434 arg_desc.AddDefaultKey(kArgScore, "num", "Cutoff score for accepting "
1435 "alignments. Can be expressed as a number or a "
1436 "function of read length: "
1437 "L,b,a for a * length + b.\n"
1438 "Zero means that the cutoff score will be equal to:\n"
1439 "read length, if read length <= 20,\n"
1440 "20, if read length <= 30,\n"
1441 "read length - 10, if read length <= 50,\n"
1442 "40, otherwise.",
1443 CArgDescriptions::eString, "0");
1444 arg_desc.AddOptionalKey(kArgMaxEditDist, "num", "Cutoff edit distance for "
1445 "accepting an alignment\nDefault = unlimited",
1446 CArgDescriptions::eInteger);
1447 arg_desc.AddDefaultKey(kArgSplice, "TF", "Search for spliced alignments",
1448 CArgDescriptions::eBoolean, "true");
1449 arg_desc.AddDefaultKey(kArgRefType, "type", "Type of the reference: "
1450 "genome or transcriptome",
1451 CArgDescriptions::eString, "genome");
1452 arg_desc.SetConstraint(kArgRefType,
1453 &(*new CArgAllow_Strings, "genome", "transcriptome"));
1454
1455 arg_desc.SetCurrentGroup("Query filtering options");
1456 arg_desc.AddDefaultKey(kArgLimitLookup, "TF", "Remove word seeds with "
1457 "high frequency in the searched database",
1458 CArgDescriptions::eBoolean, "true");
1459 arg_desc.AddDefaultKey(kArgMaxDbWordCount, "num", "Words that appear more "
1460 "than this number of times in the database will be"
1461 " masked in the lookup table",
1462 CArgDescriptions::eInteger,
1463 NStr::IntToString(MAX_DB_WORD_COUNT_MAPPER));
1464 arg_desc.AddDefaultKey(kArgLookupStride, "num", "Number of words to skip "
1465 "after collecting one while creating a lookup table",
1466 CArgDescriptions::eInteger, "0");
1467
1468 arg_desc.SetCurrentGroup("");
1469 }
1470
1471
1472 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1473 CMappingArgs::ExtractAlgorithmOptions(const CArgs& args,
1474 CBlastOptions& opt)
1475 {
1476 if (args.Exist(kArgScore) && args[kArgScore]) {
1477
1478 string s = args[kArgScore].AsString();
1479 // score cutoff may be defined as a liner function of query length:
1480 // L,0.0,0.6 ...
1481 if (s[0] == 'L') {
1482 list<string> tokens;
1483 NStr::Split(s, ",", tokens);
1484 vector<double> coeffs;
1485 if (tokens.size() < 3) {
1486 NCBI_THROW(CInputException, eInvalidInput,
1487 (string)"Incorrectly formatted score function: " +
1488 s + ". It should be of the form 'L,b,a' for ax + b,"
1489 "a, b must be numbers");
1490 }
1491 auto it = tokens.begin();
1492 ++it;
1493 try {
1494 for (; it != tokens.end(); ++it) {
1495 coeffs.push_back(NStr::StringToDouble(*it));
1496 }
1497 }
1498 catch (CException& e) {
1499 NCBI_THROW(CInputException, eInvalidInput,
1500 (string)"Incorrectly formatted score function: " +
1501 s + ". It should be of the form 'L,b,a' for ax + b,"
1502 " a, b must be real numbers");
1503 }
1504 opt.SetCutoffScoreCoeffs(coeffs);
1505 }
1506 else {
1507 // ... or a numerical constant
1508 try {
1509 opt.SetCutoffScore(NStr::StringToInt(s));
1510 }
1511 catch (CException&) {
1512 NCBI_THROW(CInputException, eInvalidInput,
1513 (string)"Incorrectly formatted score threshold: " +
1514 s + ". It must be either an integer or a linear "
1515 "function in the form: L,b,a for ax + b, a and b "
1516 "must be real numbers");
1517 }
1518 }
1519 }
1520
1521 if (args.Exist(kArgMaxEditDist) && args[kArgMaxEditDist]) {
1522 opt.SetMaxEditDistance(args[kArgMaxEditDist].AsInteger());
1523 }
1524
1525 if (args.Exist(kArgSplice) && args[kArgSplice]) {
1526 opt.SetSpliceAlignments(args[kArgSplice].AsBoolean());
1527 }
1528
1529 string ref_type = "genome";
1530 if (args.Exist(kArgRefType) && args[kArgRefType]) {
1531 ref_type = args[kArgRefType].AsString();
1532 }
1533
1534 if (args.Exist(kArgLimitLookup) && args[kArgLimitLookup]) {
1535 opt.SetLookupDbFilter(args[kArgLimitLookup].AsBoolean());
1536 }
1537 else {
1538 opt.SetLookupDbFilter(ref_type == "genome");
1539 }
1540
1541 if (args.Exist(kArgMaxDbWordCount) && args[kArgMaxDbWordCount]) {
1542 opt.SetMaxDbWordCount(args[kArgMaxDbWordCount].AsInteger());
1543 }
1544
1545 if (args.Exist(kArgLookupStride) && args[kArgLookupStride]) {
1546 opt.SetLookupTableStride(args[kArgLookupStride].AsInteger());
1547 }
1548 }
1549
1550
1551 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1552 CIgBlastArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1553 {
1554 arg_desc.SetCurrentGroup("Ig-BLAST options");
1555 const static char suffix[] = "VDJ";
1556 const static int df_num_align[3] = {3,3,3};
1557 int num_genes = (m_IsProtein) ? 1 : 3;
1558
1559
1560 for (int gene=0; gene<num_genes; ++gene) {
1561 // Subject sequence input
1562 /* TODO disabled for now
1563 string arg_sub = kArgGLSubject;
1564 arg_sub.push_back(suffix[gene]);
1565 arg_desc.AddOptionalKey(arg_sub , "filename",
1566 "Germline subject sequence to align",
1567 CArgDescriptions::eInputFile);
1568 */
1569 // Germline database file name
1570 string arg_db = kArgGLDatabase;
1571 arg_db.push_back(suffix[gene]);
1572 arg_desc.AddOptionalKey(arg_db, "germline_database_name",
1573 "Germline database name",
1574 CArgDescriptions::eString);
1575 //arg_desc.SetDependency(arg_db, CArgDescriptions::eExcludes, arg_sub);
1576 // Number of alignments to show
1577 string arg_na = kArgGLNumAlign;
1578 arg_na.push_back(suffix[gene]);
1579 arg_desc.AddDefaultKey(arg_na, "int_value",
1580 "Number of Germline sequences to show alignments for",
1581 CArgDescriptions::eInteger,
1582 NStr::IntToString(df_num_align[gene]));
1583 //arg_desc.SetConstraint(arg_na,
1584 // new CArgAllowValuesBetween(0, 4));
1585 // Seqidlist
1586 arg_desc.AddOptionalKey(arg_db + "_seqidlist", "filename",
1587 "Restrict search of germline database to list of SeqIds's",
1588 CArgDescriptions::eString);
1589 }
1590
1591 if (!m_IsProtein) {
1592 arg_desc.AddOptionalKey(kArgGLChainType, "filename",
1593 "File containing the coding frame start positions for sequences in germline J database",
1594 CArgDescriptions::eString);
1595
1596 arg_desc.AddOptionalKey(kArgMinDMatch, "min_D_match",
1597 "Required minimal consecutive nucleotide base matches for D genes ",
1598 CArgDescriptions::eInteger);
1599 arg_desc.SetConstraint(kArgMinDMatch,
1600 new CArgAllowValuesGreaterThanOrEqual(5));
1601
1602 arg_desc.AddDefaultKey(kArgVPenalty, "V_penalty",
1603 "Penalty for a nucleotide mismatch in V gene",
1604 CArgDescriptions::eInteger, "-1");
1605 arg_desc.SetConstraint(kArgVPenalty,
1606 new CArgAllowValuesBetween(-4, 0));
1607
1608
1609 arg_desc.AddDefaultKey(kArgDPenalty, "D_penalty",
1610 "Penalty for a nucleotide mismatch in D gene",
1611 CArgDescriptions::eInteger, "-2");
1612
1613 arg_desc.SetConstraint(kArgDPenalty,
1614 new CArgAllowValuesBetween(-5, 0));
1615
1616 arg_desc.AddDefaultKey(kArgJPenalty, "J_penalty",
1617 "Penalty for a nucleotide mismatch in J gene",
1618 CArgDescriptions::eInteger, "-2");
1619
1620 arg_desc.SetConstraint(kArgJPenalty,
1621 new CArgAllowValuesBetween(-4, 0));
1622
1623 arg_desc.AddDefaultKey(kArgNumClonotype, "num_clonotype",
1624 "Number of top clonotypes to show ",
1625 CArgDescriptions::eInteger, "100");
1626 arg_desc.SetConstraint(kArgNumClonotype,
1627 new CArgAllowValuesGreaterThanOrEqual(0));
1628
1629 arg_desc.AddOptionalKey(kArgClonotypeFile, "clonotype_out",
1630 "Output file name for clonotype info",
1631 CArgDescriptions::eOutputFile);
1632
1633 arg_desc.AddFlag(kArgDetectOverlap, "Allow V(D)J genes to overlap. This option is active only when D_penalty and J_penalty are set to -4 and -3, respectively", true);
1634
1635
1636 }
1637
1638 arg_desc.AddDefaultKey(kArgGLOrigin, "germline_origin",
1639 "The organism for your query sequence. Supported organisms include human, mouse, rat, rabbit and rhesus_monkey for Ig and human and mouse for TCR. Custom organism is also supported but you need to supply your own germline annotations (see IgBLAST web site for details)",
1640 CArgDescriptions::eString, "human");
1641
1642 arg_desc.AddDefaultKey(kArgGLDomainSystem, "domain_system",
1643 "Domain system to be used for segment annotation",
1644 CArgDescriptions::eString, "imgt");
1645 arg_desc.SetConstraint(kArgGLDomainSystem, &(*new CArgAllow_Strings, "kabat", "imgt"));
1646
1647 arg_desc.AddDefaultKey(kArgIgSeqType, "sequence_type",
1648 "Specify Ig or T cell receptor sequence",
1649 CArgDescriptions::eString, "Ig");
1650 arg_desc.SetConstraint(kArgIgSeqType, &(*new CArgAllow_Strings, "Ig", "TCR"));
1651
1652
1653 arg_desc.AddFlag(kArgGLFocusV, "Should the search only be for V segment (effective only for non-germline database search using -db option)?", true);
1654
1655 arg_desc.AddFlag(kArgExtendAlign5end, "Extend V gene alignment at 5' end", true);
1656
1657 arg_desc.AddFlag(kArgExtendAlign3end, "Extend J gene alignment at 3' end", true);
1658
1659 arg_desc.AddDefaultKey(kArgMinVLength, "Min_V_Length",
1660 "Minimal required V gene length",
1661 CArgDescriptions::eInteger, "9");
1662
1663 arg_desc.SetConstraint(kArgMinVLength,
1664 new CArgAllowValuesGreaterThanOrEqual(9));
1665
1666 if (! m_IsProtein) {
1667 arg_desc.AddDefaultKey(kArgMinJLength, "Min_J_Length",
1668 "Minimal required J gene length",
1669 CArgDescriptions::eInteger, "0");
1670
1671 arg_desc.SetConstraint(kArgMinJLength,
1672 new CArgAllowValuesGreaterThanOrEqual(0));
1673 }
1674
1675 if (! m_IsProtein) {
1676 arg_desc.AddFlag(kArgTranslate, "Show translated alignments", true);
1677 }
1678
1679 arg_desc.SetCurrentGroup("");
1680 }
1681
s_RegisterOMDataLoader(CRef<CSeqDB> db_handle)1682 static string s_RegisterOMDataLoader(CRef<CSeqDB> db_handle)
1683 { // the blast formatter requires that the database coexist in
1684 // the same scope with the query sequences
1685 CRef<CObjectManager> om = CObjectManager::GetInstance();
1686 CBlastDbDataLoader::RegisterInObjectManager(*om, db_handle, true,
1687 CObjectManager::eDefault,
1688 CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
1689 CBlastDbDataLoader::SBlastDbParam param(db_handle);
1690 string retval(CBlastDbDataLoader::GetLoaderNameFromArgs(param));
1691 _TRACE("Registering " << retval << " at priority " <<
1692 (int)CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
1693 return retval;
1694 }
1695
1696 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opts)1697 CIgBlastArgs::ExtractAlgorithmOptions(const CArgs& args,
1698 CBlastOptions& opts)
1699 {
1700 string paths[3];
1701 CNcbiEnvironment env;
1702 paths[0] = CDirEntry::NormalizePath(CDir::GetCwd(), eFollowLinks);
1703 paths[1] = CDirEntry::NormalizePath(env.Get("IGDATA"), eFollowLinks);
1704 CNcbiApplication* app = CNcbiApplication::Instance();
1705 if (app) {
1706 const CNcbiRegistry& registry = app->GetConfig();
1707 paths[2] = CDirEntry::NormalizePath(registry.Get("BLAST","IGDATA"), eFollowLinks);
1708 } else {
1709 #if defined(NCBI_OS_DARWIN)
1710 paths[2] = "/usr/local/ncbi/igblast/data";
1711 #else
1712 paths[2] = paths[0];
1713 #endif
1714 }
1715
1716 m_IgOptions.Reset(new CIgBlastOptions());
1717
1718 CBlastDatabaseArgs::EMoleculeType mol_type = Blast_SubjectIsNucleotide(opts.GetProgramType())
1719 ? CSearchDatabase::eBlastDbIsNucleotide
1720 : CSearchDatabase::eBlastDbIsProtein;
1721
1722 m_IgOptions->m_IsProtein = (mol_type == CSearchDatabase::eBlastDbIsProtein);
1723 m_IgOptions->m_Origin = args[kArgGLOrigin].AsString();
1724 m_IgOptions->m_DomainSystem = args[kArgGLDomainSystem].AsString();
1725 m_IgOptions->m_FocusV = args.Exist(kArgGLFocusV) ? args[kArgGLFocusV] : false;
1726 m_IgOptions->m_ExtendAlign5end = args.Exist(kArgExtendAlign5end) ? args[kArgExtendAlign5end] : false;
1727 m_IgOptions->m_ExtendAlign3end = args.Exist(kArgExtendAlign3end) ? args[kArgExtendAlign3end] : false;
1728 m_IgOptions->m_DetectOverlap = args.Exist(kArgDetectOverlap) ? args[kArgDetectOverlap] : false;
1729 m_IgOptions->m_MinVLength = args[kArgMinVLength].AsInteger();
1730 if (args.Exist(kArgMinJLength) && args[kArgMinJLength]) {
1731 m_IgOptions->m_MinJLength = args[kArgMinJLength].AsInteger();
1732 } else {
1733 m_IgOptions->m_MinJLength = 0;
1734 }
1735 m_IgOptions->m_Translate = args.Exist(kArgTranslate) ? args[kArgTranslate] : false;
1736 if (!m_IsProtein) {
1737 string aux_file = (args.Exist(kArgGLChainType) && args[kArgGLChainType])
1738 ? args[kArgGLChainType].AsString()
1739 : m_IgOptions->m_Origin + "_gl.aux";
1740 m_IgOptions->m_AuxFilename = aux_file;
1741 for (int i=0; i<3; i++) {
1742 string aux_path = CDirEntry::ConcatPath(paths[i], aux_file);
1743 CDirEntry entry(aux_path);
1744 if (entry.Exists() && entry.IsFile()) {
1745 m_IgOptions->m_AuxFilename = aux_path;
1746 break;
1747 }
1748 }
1749 }
1750
1751 _ASSERT(m_IsProtein == m_IgOptions->m_IsProtein);
1752
1753 m_Scope.Reset(new CScope(*CObjectManager::GetInstance()));
1754
1755 // default germline database name for annotation
1756 for (int i=0; i<3; i++) {
1757 string int_data = CDirEntry::ConcatPath(paths[i], "internal_data");
1758 CDirEntry entry(int_data);
1759 if (entry.Exists() && entry.IsDir()) {
1760 m_IgOptions->m_IgDataPath = int_data;
1761 break;
1762 }
1763 }
1764
1765 m_IgOptions->m_SequenceType = "Ig";
1766 if (args.Exist(kArgIgSeqType) && args[kArgIgSeqType]) {
1767 m_IgOptions->m_SequenceType = args[kArgIgSeqType].AsString();
1768 }
1769
1770 string df_db_name = CDirEntry::ConcatPath(
1771 CDirEntry::ConcatPath(m_IgOptions->m_IgDataPath,
1772 m_IgOptions->m_Origin), m_IgOptions->m_Origin +
1773 ((m_IgOptions->m_SequenceType == "TCR")?"_TR":"") + "_V");
1774 CRef<CSearchDatabase> db(new CSearchDatabase(df_db_name, mol_type));
1775 m_IgOptions->m_Db[3].Reset(new CLocalDbAdapter(*db));
1776 try {
1777 db->GetSeqDb();
1778 } catch(...) {
1779 NCBI_THROW(CInputException, eInvalidInput,
1780 "Germline annotation database " + df_db_name + " could not be found in [internal_data] directory");
1781 }
1782
1783 m_IgOptions->m_Min_D_match = 5;
1784 if (args.Exist(kArgMinDMatch) && args[kArgMinDMatch]) {
1785 m_IgOptions->m_Min_D_match = args[kArgMinDMatch].AsInteger();
1786 }
1787
1788 if (args.Exist(kArgVPenalty) && args[kArgVPenalty]) {
1789 m_IgOptions->m_V_penalty = args[kArgVPenalty].AsInteger();
1790 }
1791
1792 if (args.Exist(kArgDPenalty) && args[kArgDPenalty]) {
1793 m_IgOptions->m_D_penalty = args[kArgDPenalty].AsInteger();
1794 }
1795
1796 if (args.Exist(kArgJPenalty) && args[kArgJPenalty]) {
1797 m_IgOptions->m_J_penalty = args[kArgJPenalty].AsInteger();
1798 }
1799
1800 CRef<CBlastOptionsHandle> opts_hndl;
1801 if (m_IgOptions->m_IsProtein) {
1802 opts_hndl.Reset(CBlastOptionsFactory::Create(eBlastp));
1803 } else {
1804 opts_hndl.Reset(CBlastOptionsFactory::Create(eBlastn));
1805 }
1806
1807 const static char suffix[] = "VDJ";
1808 int num_genes = (m_IsProtein) ? 1: 3;
1809 for (int gene=0; gene< num_genes; ++gene) {
1810 string arg_sub = kArgGLSubject;
1811 string arg_db = kArgGLDatabase;
1812 string arg_na = kArgGLNumAlign;
1813
1814 arg_sub.push_back(suffix[gene]);
1815 arg_db.push_back(suffix[gene]);
1816 arg_na.push_back(suffix[gene]);
1817
1818 m_IgOptions->m_NumAlign[gene] = args[arg_na].AsInteger();
1819
1820 if (args.Exist(arg_sub) && args[arg_sub]) {
1821 CNcbiIstream& subj_input_stream = args[arg_sub].AsInputFile();
1822 TSeqRange subj_range;
1823
1824 const bool parse_deflines = args.Exist(kArgParseDeflines)
1825 ? bool(args[kArgParseDeflines])
1826 : kDfltArgParseDeflines;
1827 const bool use_lcase_masks = args.Exist(kArgUseLCaseMasking)
1828 ? bool(args[kArgUseLCaseMasking])
1829 : kDfltArgUseLCaseMasking;
1830 CRef<blast::CBlastQueryVector> subjects;
1831 CRef<CScope> scope = ReadSequencesToBlast(subj_input_stream,
1832 m_IgOptions->m_IsProtein,
1833 subj_range, parse_deflines,
1834 use_lcase_masks, subjects);
1835 m_Scope->AddScope(*scope,
1836 CBlastDatabaseArgs::kSubjectsDataLoaderPriority);
1837 CRef<IQueryFactory> sub_seqs(
1838 new blast::CObjMgr_QueryFactory(*subjects));
1839 m_IgOptions->m_Db[gene].Reset(new CLocalDbAdapter(
1840 sub_seqs, opts_hndl));
1841 } else {
1842 string gl_db_name = m_IgOptions->m_Origin + "_gl_";
1843 gl_db_name.push_back(suffix[gene]);
1844 string db_name = (args.Exist(arg_db) && args[arg_db])
1845 ? args[arg_db].AsString() : gl_db_name;
1846 db.Reset(new CSearchDatabase(db_name, mol_type));
1847
1848 if (args.Exist(arg_db + "_seqidlist") && args[arg_db + "_seqidlist"]) {
1849 string fn(SeqDB_ResolveDbPath(args[arg_db + "_seqidlist"].AsString()));
1850 db->SetGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn,
1851 CSeqDBFileGiList::eSiList)));
1852 }
1853
1854 m_IgOptions->m_Db[gene].Reset(new CLocalDbAdapter(*db));
1855 m_Scope->AddDataLoader(s_RegisterOMDataLoader(db->GetSeqDb()));
1856 }
1857 }
1858 }
1859
1860 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1861 CQueryOptionsArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1862 {
1863
1864 arg_desc.SetCurrentGroup("Query filtering options");
1865 // lowercase masking
1866 arg_desc.AddFlag(kArgUseLCaseMasking,
1867 "Use lower case filtering in query and subject sequence(s)?", true);
1868
1869 arg_desc.SetCurrentGroup("Input query options");
1870 // query location
1871 arg_desc.AddOptionalKey(kArgQueryLocation, "range",
1872 "Location on the query sequence in 1-based offsets "
1873 "(Format: start-stop)",
1874 CArgDescriptions::eString);
1875
1876 if ( !m_QueryCannotBeNucl) {
1877 // search strands
1878 arg_desc.AddDefaultKey(kArgStrand, "strand",
1879 "Query strand(s) to search against database/subject",
1880 CArgDescriptions::eString, kDfltArgStrand);
1881 arg_desc.SetConstraint(kArgStrand, &(*new CArgAllow_Strings,
1882 kDfltArgStrand, "plus", "minus"));
1883 }
1884
1885 arg_desc.SetCurrentGroup("Miscellaneous options");
1886 arg_desc.AddFlag(kArgParseDeflines,
1887 "Should the query and subject defline(s) be parsed?", true);
1888
1889 arg_desc.SetCurrentGroup("");
1890 }
1891
1892 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1893 CQueryOptionsArgs::ExtractAlgorithmOptions(const CArgs& args,
1894 CBlastOptions& opt)
1895 {
1896 // Get the strand
1897 {
1898 m_Strand = eNa_strand_unknown;
1899
1900 if (!Blast_QueryIsProtein(opt.GetProgramType())) {
1901
1902 if (args.Exist(kArgStrand) && args[kArgStrand]) {
1903 const string& kStrand = args[kArgStrand].AsString();
1904 if (kStrand == "both") {
1905 m_Strand = eNa_strand_both;
1906 } else if (kStrand == "plus") {
1907 m_Strand = eNa_strand_plus;
1908 } else if (kStrand == "minus") {
1909 m_Strand = eNa_strand_minus;
1910 } else {
1911 abort();
1912 }
1913 }
1914 else {
1915 m_Strand = eNa_strand_both;
1916 }
1917 }
1918 }
1919
1920 // set the sequence range
1921 if (args.Exist(kArgQueryLocation) && args[kArgQueryLocation]) {
1922 m_Range = ParseSequenceRange(args[kArgQueryLocation].AsString(),
1923 "Invalid specification of query location");
1924 }
1925
1926 m_UseLCaseMask = args.Exist(kArgUseLCaseMasking) &&
1927 static_cast<bool>(args[kArgUseLCaseMasking]);
1928 m_ParseDeflines = args.Exist(kArgParseDeflines) &&
1929 static_cast<bool>(args[kArgParseDeflines]);
1930 }
1931
1932 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)1933 CMapperQueryOptionsArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
1934 {
1935
1936 arg_desc.SetCurrentGroup("Query filtering options");
1937 // lowercase masking
1938 arg_desc.AddFlag(kArgUseLCaseMasking,
1939 "Use lower case filtering in subject sequence(s)?", true);
1940 arg_desc.AddDefaultKey(kArgQualityFilter, "TF", "Reject low quality "
1941 "sequences ", CArgDescriptions::eBoolean, "true");
1942
1943 arg_desc.SetCurrentGroup("Input query options");
1944 arg_desc.AddDefaultKey(kArgInputFormat, "format", "Input format for "
1945 "sequences", CArgDescriptions::eString, "fasta");
1946 arg_desc.SetConstraint(kArgInputFormat, &(*new CArgAllow_Strings,
1947 "fasta", "fastc", "fastq",
1948 "asn1", "asn1b"));
1949 arg_desc.AddFlag(kArgPaired, "Input query sequences are paired", true);
1950 arg_desc.AddOptionalKey(kArgQueryMate, "infile", "FASTA file with "
1951 "mates for query sequences (if given in "
1952 "another file)", CArgDescriptions::eInputFile);
1953 arg_desc.SetDependency(kArgQueryMate, CArgDescriptions::eRequires,
1954 kArgQuery);
1955
1956 arg_desc.AddOptionalKey(kArgSraAccession, "accession",
1957 "Comma-separated SRA accessions",
1958 CArgDescriptions::eString);
1959 arg_desc.SetDependency(kArgSraAccession, CArgDescriptions::eExcludes,
1960 kArgQuery);
1961 arg_desc.SetDependency(kArgSraAccession, CArgDescriptions::eExcludes,
1962 kArgInputFormat);
1963
1964 arg_desc.AddOptionalKey(kArgSraAccessionBatch, "file",
1965 "File with a list of SRA accessions, one per line",
1966 CArgDescriptions::eInputFile);
1967 arg_desc.SetDependency(kArgSraAccessionBatch, CArgDescriptions::eExcludes,
1968 kArgSraAccession);
1969 arg_desc.SetDependency(kArgSraAccessionBatch, CArgDescriptions::eExcludes,
1970 kArgQuery);
1971 arg_desc.SetDependency(kArgSraAccessionBatch, CArgDescriptions::eExcludes,
1972 kArgInputFormat);
1973
1974 arg_desc.SetCurrentGroup("Miscellaneous options");
1975 arg_desc.AddDefaultKey(kArgParseDeflines, "TF", "Should the query and "
1976 "subject defline(s) be parsed?",
1977 CArgDescriptions::eBoolean, "true");
1978
1979 arg_desc.AddFlag(kArgEnableSraCache, "Enable SRA caching in local files");
1980 arg_desc.SetDependency(kArgEnableSraCache, CArgDescriptions::eRequires,
1981 kArgSraAccession);
1982
1983
1984 arg_desc.SetCurrentGroup("");
1985 }
1986
1987 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)1988 CMapperQueryOptionsArgs::ExtractAlgorithmOptions(const CArgs& args,
1989 CBlastOptions& opt)
1990 {
1991 CQueryOptionsArgs::ExtractAlgorithmOptions(args, opt);
1992
1993 if (args.Exist(kArgPaired) && args[kArgPaired]) {
1994 opt.SetPaired(true);
1995 m_IsPaired = true;
1996 }
1997
1998 if (args.Exist(kArgInputFormat) && args[kArgInputFormat]) {
1999 if (args[kArgInputFormat].AsString() == "fasta") {
2000 m_InputFormat = eFasta;
2001 }
2002 else if (args[kArgInputFormat].AsString() == "fastc") {
2003 m_InputFormat = eFastc;
2004 }
2005 else if (args[kArgInputFormat].AsString() == "fastq") {
2006 m_InputFormat = eFastq;
2007 }
2008 else if (args[kArgInputFormat].AsString() == "asn1") {
2009 m_InputFormat = eASN1text;
2010 }
2011 else if (args[kArgInputFormat].AsString() == "asn1b") {
2012 m_InputFormat = eASN1bin;
2013 }
2014 else {
2015 NCBI_THROW(CInputException, eInvalidInput,
2016 "Unexpected input format: " +
2017 args[kArgInputFormat].AsString());
2018 }
2019 }
2020
2021 if (m_InputFormat == eFastc) {
2022 // FASTC format always has pairs in a single file
2023 opt.SetPaired(true);
2024 m_IsPaired = true;
2025 }
2026
2027 if (args.Exist(kArgQualityFilter) && args[kArgQualityFilter]) {
2028 opt.SetReadQualityFiltering(args[kArgQualityFilter].AsBoolean());
2029 }
2030
2031 if (args.Exist(kArgQueryMate) && args[kArgQueryMate]) {
2032 // create a decompress stream is the file is compressed
2033 // (the primary query file is handeled by CStdCmdLieArgs object)
2034 if (NStr::EndsWith(args[kArgQueryMate].AsString(), ".gz",
2035 NStr::eNocase)) {
2036 m_DecompressIStream.reset(new CDecompressIStream(
2037 args[kArgQueryMate].AsInputFile(),
2038 CDecompressIStream::eGZipFile));
2039 m_MateInputStream = m_DecompressIStream.get();
2040 }
2041 else {
2042 m_MateInputStream = &args[kArgQueryMate].AsInputFile();
2043 }
2044
2045 // queries have pairs in the mate stream
2046 opt.SetPaired(true);
2047 m_IsPaired = true;
2048 }
2049
2050 if ((args.Exist(kArgSraAccession) && args[kArgSraAccession]) ||
2051 (args.Exist(kArgSraAccessionBatch) && args[kArgSraAccessionBatch])) {
2052
2053 if (args[kArgSraAccession]) {
2054 // accessions given in the command-line
2055 NStr::Split((CTempString)args[kArgSraAccession].AsString(), ",",
2056 m_SraAccessions);
2057 }
2058 else {
2059 // accessions given in a file
2060 while (!args[kArgSraAccessionBatch].AsInputFile().eof()) {
2061 string line;
2062 args[kArgSraAccessionBatch].AsInputFile() >> line;
2063 if (!line.empty()) {
2064 m_SraAccessions.push_back(line);
2065 }
2066 }
2067 }
2068
2069 if (m_SraAccessions.empty()) {
2070 NCBI_THROW(CInputException, eInvalidInput,
2071 "No SRA accessions provided");
2072 }
2073
2074 m_InputFormat = eSra;
2075 // assume SRA input is paired, that information for each read is in
2076 // SRA database, this option will trigger checking for pairs
2077 opt.SetPaired(true);
2078 m_IsPaired = true;
2079 }
2080
2081 if (args.Exist(kArgEnableSraCache) && args[kArgEnableSraCache]) {
2082 m_EnableSraCache = true;
2083 }
2084 }
2085
2086
2087
CBlastDatabaseArgs(bool request_mol_type,bool is_rpsblast,bool is_igblast,bool is_mapper,bool is_kblast)2088 CBlastDatabaseArgs::CBlastDatabaseArgs(bool request_mol_type /* = false */,
2089 bool is_rpsblast /* = false */,
2090 bool is_igblast /* = false */,
2091 bool is_mapper /* = false */,
2092 bool is_kblast /* = false */)
2093 : m_RequestMoleculeType(request_mol_type),
2094 m_IsRpsBlast(is_rpsblast),
2095 m_IsIgBlast(is_igblast),
2096 m_IsProtein(true),
2097 m_IsMapper(is_mapper),
2098 m_IsKBlast(is_kblast),
2099 m_SupportsDatabaseMasking(false),
2100 m_SupportIPGFiltering(false)
2101 {}
2102
2103 bool
HasBeenSet(const CArgs & args)2104 CBlastDatabaseArgs::HasBeenSet(const CArgs& args)
2105 {
2106 if ( (args.Exist(kArgDb) && args[kArgDb].HasValue()) ||
2107 (args.Exist(kArgSubject) && args[kArgSubject].HasValue()) ) {
2108 return true;
2109 }
2110 return false;
2111 }
2112
2113 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)2114 CBlastDatabaseArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
2115 {
2116 arg_desc.SetCurrentGroup("General search options");
2117 // database filename
2118 if (m_IsIgBlast){
2119 arg_desc.AddOptionalKey(kArgDb, "database_name", "Optional additional database name",
2120 CArgDescriptions::eString);
2121 } else {
2122 arg_desc.AddOptionalKey(kArgDb, "database_name", "BLAST database name",
2123 CArgDescriptions::eString);
2124 }
2125
2126 arg_desc.SetCurrentGroup("");
2127
2128 if (m_RequestMoleculeType) {
2129 arg_desc.AddKey(kArgDbType, "database_type",
2130 "BLAST database molecule type",
2131 CArgDescriptions::eString);
2132 arg_desc.SetConstraint(kArgDbType,
2133 &(*new CArgAllow_Strings, "prot", "nucl"));
2134 }
2135
2136 vector<string> database_args;
2137 database_args.push_back(kArgDb);
2138 database_args.push_back(kArgGiList);
2139 database_args.push_back(kArgSeqIdList);
2140 database_args.push_back(kArgNegativeGiList);
2141 database_args.push_back(kArgNegativeSeqidList);
2142 database_args.push_back(kArgTaxIdList);
2143 database_args.push_back(kArgTaxIdListFile);
2144 database_args.push_back(kArgNegativeTaxIdList);
2145 database_args.push_back(kArgNegativeTaxIdListFile);
2146 if (m_SupportIPGFiltering) {
2147 database_args.push_back(kArgIpgList);
2148 database_args.push_back(kArgNegativeIpgList);
2149 }
2150 if (m_SupportsDatabaseMasking) {
2151 database_args.push_back(kArgDbSoftMask);
2152 database_args.push_back(kArgDbHardMask);
2153 }
2154
2155 // DB size
2156 if (!m_IsMapper) {
2157 arg_desc.SetCurrentGroup("Statistical options");
2158 arg_desc.AddOptionalKey(kArgDbSize, "num_letters",
2159 "Effective length of the database ",
2160 CArgDescriptions::eInt8);
2161 }
2162
2163 arg_desc.SetCurrentGroup("Restrict search or results");
2164 // GI list
2165 if (!m_IsRpsBlast && !m_IsIgBlast) {
2166 arg_desc.AddOptionalKey(kArgGiList, "filename",
2167 "Restrict search of database to list of GIs",
2168 CArgDescriptions::eString);
2169 // SeqId list
2170 arg_desc.AddOptionalKey(kArgSeqIdList, "filename",
2171 "Restrict search of database to list of SeqIDs",
2172 CArgDescriptions::eString);
2173 // Negative GI list
2174 arg_desc.AddOptionalKey(kArgNegativeGiList, "filename",
2175 "Restrict search of database to everything"
2176 " except the specified GIs",
2177 CArgDescriptions::eString);
2178
2179 // Negative SeqId list
2180 arg_desc.AddOptionalKey(kArgNegativeSeqidList, "filename",
2181 "Restrict search of database to everything"
2182 " except the specified SeqIDs",
2183 CArgDescriptions::eString);
2184
2185 // Tax ID list
2186 arg_desc.AddOptionalKey(kArgTaxIdList, "taxids",
2187 "Restrict search of database to include only "
2188 "the specified taxonomy IDs "
2189 "(multiple IDs delimited by ',')",
2190 CArgDescriptions::eString);
2191 arg_desc.AddOptionalKey(kArgNegativeTaxIdList, "taxids",
2192 "Restrict search of database to everything "
2193 "except the specified taxonomy IDs "
2194 "(multiple IDs delimited by ',')",
2195 CArgDescriptions::eString);
2196 // Tax ID list file
2197 arg_desc.AddOptionalKey(kArgTaxIdListFile, "filename",
2198 "Restrict search of database to include only "
2199 "the specified taxonomy IDs",
2200 CArgDescriptions::eString);
2201 arg_desc.AddOptionalKey(kArgNegativeTaxIdListFile, "filename",
2202 "Restrict search of database to everything "
2203 "except the specified taxonomy IDs",
2204 CArgDescriptions::eString);
2205
2206 if (m_SupportIPGFiltering) {
2207 arg_desc.AddOptionalKey(kArgIpgList, "filename",
2208 "Restrict search of database to list of IPGs",
2209 CArgDescriptions::eString);
2210
2211 // Negative IPG list
2212 arg_desc.AddOptionalKey(kArgNegativeIpgList, "filename",
2213 "Restrict search of database to everything"
2214 " except the specified IPGs",
2215 CArgDescriptions::eString);
2216 }
2217 // N.B.: all restricting options are mutually exclusive
2218 const vector<string> kBlastDBFilteringOptions = {
2219 kArgGiList,
2220 kArgSeqIdList,
2221 kArgTaxIdList,
2222 kArgTaxIdListFile,
2223
2224 kArgNegativeGiList,
2225 kArgNegativeSeqidList,
2226 kArgNegativeTaxIdList,
2227 kArgNegativeTaxIdListFile,
2228
2229 };
2230 for (size_t i = 0; i < kBlastDBFilteringOptions.size(); i++) {
2231 for (size_t j = i+1; j < kBlastDBFilteringOptions.size(); j++) {
2232 arg_desc.SetDependency(kBlastDBFilteringOptions[i], CArgDescriptions::eExcludes,
2233 kBlastDBFilteringOptions[j]);
2234 }
2235 }
2236
2237 // For now, disable pairing -remote with either -gilist or
2238 // -negative_gilist as this is not implemented in the BLAST server
2239 for (const string& s: kBlastDBFilteringOptions) {
2240 arg_desc.SetDependency(kArgRemote, CArgDescriptions::eExcludes, s);
2241 }
2242 }
2243
2244 // Entrez Query
2245 if (!m_IsMapper) {
2246 arg_desc.AddOptionalKey(kArgEntrezQuery, "entrez_query",
2247 "Restrict search with the given Entrez query",
2248 CArgDescriptions::eString);
2249
2250 // Entrez query currently requires the -remote option
2251 arg_desc.SetDependency(kArgEntrezQuery, CArgDescriptions::eRequires,
2252 kArgRemote);
2253 }
2254
2255
2256 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
2257 (!defined(NCBI_COMPILER_MIPSPRO)) )
2258 // Masking of database
2259 if (m_SupportsDatabaseMasking) {
2260 arg_desc.AddOptionalKey(kArgDbSoftMask,
2261 "filtering_algorithm",
2262 "Filtering algorithm ID to apply to the BLAST database as soft "
2263 "masking",
2264 CArgDescriptions::eString);
2265 arg_desc.SetDependency(kArgDbSoftMask, CArgDescriptions::eExcludes,
2266 kArgDbHardMask);
2267
2268 arg_desc.AddOptionalKey(kArgDbHardMask,
2269 "filtering_algorithm",
2270 "Filtering algorithm ID to apply to the BLAST database as hard "
2271 "masking",
2272 CArgDescriptions::eString);
2273 }
2274 #endif
2275
2276 // There is no RPS-BLAST 2 sequences
2277 if ( !m_IsRpsBlast && !m_IsKBlast && !m_IsIgBlast) {
2278 arg_desc.SetCurrentGroup("BLAST-2-Sequences options");
2279 // subject sequence input (for bl2seq)
2280 arg_desc.AddOptionalKey(kArgSubject, "subject_input_file",
2281 "Subject sequence(s) to search",
2282 CArgDescriptions::eInputFile);
2283 ITERATE(vector<string>, dbarg, database_args) {
2284 arg_desc.SetDependency(kArgSubject, CArgDescriptions::eExcludes,
2285 *dbarg);
2286 }
2287
2288 // subject location
2289 arg_desc.AddOptionalKey(kArgSubjectLocation, "range",
2290 "Location on the subject sequence in 1-based offsets "
2291 "(Format: start-stop)",
2292 CArgDescriptions::eString);
2293 ITERATE(vector<string>, dbarg, database_args) {
2294 arg_desc.SetDependency(kArgSubjectLocation,
2295 CArgDescriptions::eExcludes,
2296 *dbarg);
2297 }
2298 // Because Blast4-subject does not support Seq-locs, specifying a
2299 // subject range does not work for remote searches
2300 arg_desc.SetDependency(kArgSubjectLocation,
2301 CArgDescriptions::eExcludes, kArgRemote);
2302 }
2303
2304 arg_desc.SetCurrentGroup("");
2305 }
2306
2307
2308
2309
s_GetTaxIDList(const string & in,bool isFile,bool isNegativeList,CRef<CSearchDatabase> & sdb)2310 static void s_GetTaxIDList(const string & in, bool isFile, bool isNegativeList, CRef<CSearchDatabase> & sdb)
2311 {
2312 vector<string> ids;
2313 if (isFile) {
2314 string filename(SeqDB_ResolveDbPath(in));
2315 if(filename == kEmptyStr) {
2316 NCBI_THROW(CInputException, eInvalidInput,
2317 "File is not acessible: "+ in );
2318 }
2319 CNcbiIfstream instream(filename.c_str());
2320 CStreamLineReader reader(instream);
2321 while (!reader.AtEOF()) {
2322 reader.ReadLine();
2323 ids.push_back(reader.GetCurrentLine());
2324 }
2325 }
2326 else {
2327 NStr::Split(in, ",", ids, NStr::fSplit_Tokenize);
2328 }
2329
2330 set<TTaxId> tax_ids;
2331 for(unsigned int i=0; i < ids.size(); i++) {
2332 try {
2333 if(NStr::IsBlank(ids[i])){
2334 continue;
2335 }
2336 tax_ids.insert(NStr::StringToNumeric<TTaxId>(ids[i], NStr::fAllowLeadingSpaces | NStr::fAllowTrailingSpaces));
2337 }
2338 catch(CException &){
2339 NCBI_THROW(CInputException, eInvalidInput, "Invalid taxidlist file ");
2340 }
2341 }
2342
2343 CRef<CSeqDBGiList> taxid_list(new CSeqDBGiList());
2344 taxid_list->AddTaxIds(tax_ids);
2345 if(isNegativeList) {
2346 sdb->SetNegativeGiList(taxid_list.GetPointer());
2347 }
2348 else {
2349 sdb->SetGiList(taxid_list.GetPointer());
2350 }
2351
2352 }
2353
2354
2355 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opts)2356 CBlastDatabaseArgs::ExtractAlgorithmOptions(const CArgs& args,
2357 CBlastOptions& opts)
2358 {
2359 EMoleculeType mol_type = Blast_SubjectIsNucleotide(opts.GetProgramType())
2360 ? CSearchDatabase::eBlastDbIsNucleotide
2361 : CSearchDatabase::eBlastDbIsProtein;
2362 m_IsProtein = (mol_type == CSearchDatabase::eBlastDbIsProtein);
2363
2364 if (args.Exist(kArgDb) && args[kArgDb]) {
2365
2366 m_SearchDb.Reset(new CSearchDatabase(args[kArgDb].AsString(),
2367 mol_type));
2368
2369 if (args.Exist(kArgGiList) && args[kArgGiList]) {
2370 string fn(SeqDB_ResolveDbPath(args[kArgGiList].AsString()));
2371 m_SearchDb->SetGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn)));
2372
2373 } else if (args.Exist(kArgNegativeGiList) && args[kArgNegativeGiList]) {
2374 string fn(SeqDB_ResolveDbPath(args[kArgNegativeGiList].AsString()));
2375 m_SearchDb->SetNegativeGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn)));
2376
2377 } else if (args.Exist(kArgSeqIdList) && args[kArgSeqIdList]) {
2378 string fn(SeqDB_ResolveDbPath(args[kArgSeqIdList].AsString()));
2379 m_SearchDb->SetGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn,
2380 CSeqDBFileGiList::eSiList)));
2381 } else if (args.Exist(kArgNegativeSeqidList) && args[kArgNegativeSeqidList]) {
2382 string fn(SeqDB_ResolveDbPath(args[kArgNegativeSeqidList].AsString()));
2383 m_SearchDb->SetNegativeGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn,CSeqDBFileGiList::eSiList)));
2384 } else if (args.Exist(kArgTaxIdList) && args[kArgTaxIdList]) {
2385 s_GetTaxIDList(args[kArgTaxIdList].AsString(), false, false, m_SearchDb);
2386
2387 } else if (args.Exist(kArgTaxIdListFile) && args[kArgTaxIdListFile]) {
2388 s_GetTaxIDList(args[kArgTaxIdListFile].AsString(), true, false, m_SearchDb);
2389
2390 } else if (args.Exist(kArgNegativeTaxIdList) && args[kArgNegativeTaxIdList]) {
2391 s_GetTaxIDList(args[kArgNegativeTaxIdList].AsString(), false, true, m_SearchDb);
2392
2393 } else if (args.Exist(kArgNegativeTaxIdListFile) && args[kArgNegativeTaxIdListFile]) {
2394 s_GetTaxIDList(args[kArgNegativeTaxIdListFile].AsString(), true, true, m_SearchDb);
2395
2396 } else if (args.Exist(kArgIpgList) && args[kArgIpgList]) {
2397 string fn(SeqDB_ResolveDbPath(args[kArgIpgList].AsString()));
2398 m_SearchDb->SetGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn, CSeqDBFileGiList::ePigList)));
2399 } else if (args.Exist(kArgNegativeIpgList) && args[kArgNegativeIpgList]) {
2400 string fn(SeqDB_ResolveDbPath(args[kArgNegativeIpgList].AsString()));
2401 m_SearchDb->SetNegativeGiList(CRef<CSeqDBGiList> (new CSeqDBFileGiList(fn, CSeqDBFileGiList::ePigList)));
2402
2403 }
2404
2405 if (args.Exist(kArgEntrezQuery) && args[kArgEntrezQuery])
2406 m_SearchDb->SetEntrezQueryLimitation(args[kArgEntrezQuery].AsString());
2407
2408 #if ((!defined(NCBI_COMPILER_WORKSHOP) || (NCBI_COMPILER_VERSION > 550)) && \
2409 (!defined(NCBI_COMPILER_MIPSPRO)) )
2410 if (args.Exist(kArgDbSoftMask) && args[kArgDbSoftMask]) {
2411 m_SearchDb->SetFilteringAlgorithm(args[kArgDbSoftMask].AsString(), eSoftSubjMasking);
2412 } else if (args.Exist(kArgDbHardMask) && args[kArgDbHardMask]) {
2413 m_SearchDb->SetFilteringAlgorithm(args[kArgDbHardMask].AsString(), eHardSubjMasking);
2414 }
2415 #endif
2416 } else if (args.Exist(kArgSubject) && args[kArgSubject]) {
2417
2418 CNcbiIstream* subj_input_stream = NULL;
2419 unique_ptr<CDecompressIStream> decompress_stream;
2420 if (m_IsMapper &&
2421 NStr::EndsWith(args[kArgSubject].AsString(), ".gz", NStr::eNocase)) {
2422 decompress_stream.reset(
2423 new CDecompressIStream(args[kArgSubject].AsInputFile(),
2424 CDecompressIStream::eGZipFile));
2425 subj_input_stream = decompress_stream.get();
2426 }
2427 else {
2428 subj_input_stream = &args[kArgSubject].AsInputFile();
2429 }
2430
2431 TSeqRange subj_range;
2432 if (args.Exist(kArgSubjectLocation) && args[kArgSubjectLocation]) {
2433 subj_range =
2434 ParseSequenceRange(args[kArgSubjectLocation].AsString(),
2435 "Invalid specification of subject location");
2436 }
2437
2438 const bool parse_deflines = args.Exist(kArgParseDeflines)
2439 ? args[kArgParseDeflines].AsBoolean()
2440 : kDfltArgParseDeflines;
2441 const bool use_lcase_masks = args.Exist(kArgUseLCaseMasking)
2442 ? bool(args[kArgUseLCaseMasking])
2443 : kDfltArgUseLCaseMasking;
2444 CRef<blast::CBlastQueryVector> subjects;
2445 m_Scope = ReadSequencesToBlast(*subj_input_stream, IsProtein(),
2446 subj_range, parse_deflines,
2447 use_lcase_masks, subjects, m_IsMapper);
2448 m_Subjects.Reset(new blast::CObjMgr_QueryFactory(*subjects));
2449
2450 } else if (!m_IsIgBlast){
2451 // IgBlast permits use of germline database
2452 NCBI_THROW(CInputException, eInvalidInput,
2453 "Either a BLAST database or subject sequence(s) must be specified");
2454 }
2455
2456 if (opts.GetEffectiveSearchSpace() != 0) {
2457 // no need to set any other options, as this trumps them
2458 return;
2459 }
2460
2461 if (args.Exist(kArgDbSize) && args[kArgDbSize]) {
2462 opts.SetDbLength(args[kArgDbSize].AsInt8());
2463 }
2464
2465 }
2466
2467 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)2468 CFormattingArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
2469 {
2470 arg_desc.SetCurrentGroup("Formatting options");
2471
2472 string kOutputFormatDescription = string(
2473 "alignment view options:\n"
2474 " 0 = Pairwise,\n"
2475 " 1 = Query-anchored showing identities,\n"
2476 " 2 = Query-anchored no identities,\n"
2477 " 3 = Flat query-anchored showing identities,\n"
2478 " 4 = Flat query-anchored no identities,\n"
2479 " 5 = BLAST XML,\n"
2480 " 6 = Tabular,\n"
2481 " 7 = Tabular with comment lines,\n"
2482 " 8 = Seqalign (Text ASN.1),\n"
2483 " 9 = Seqalign (Binary ASN.1),\n"
2484 " 10 = Comma-separated values,\n"
2485 " 11 = BLAST archive (ASN.1),\n"
2486 " 12 = Seqalign (JSON),\n"
2487 " 13 = Multiple-file BLAST JSON,\n"
2488 " 14 = Multiple-file BLAST XML2,\n"
2489 " 15 = Single-file BLAST JSON,\n"
2490 " 16 = Single-file BLAST XML2");
2491
2492 if(m_FormatFlags & eIsSAM) {
2493 kOutputFormatDescription += ",\n 17 = Sequence Alignment/Map (SAM)";
2494 }
2495 kOutputFormatDescription += ",\n 18 = Organism Report\n\n";
2496 if(m_FormatFlags & eIsSAM) {
2497 kOutputFormatDescription +=
2498 "Options 6, 7, 10 and 17 "
2499 "can be additionally configured to produce\n"
2500 "a custom format specified by space delimited format specifiers,\n"
2501 "or in the case of options 6, 7, and 10, by a token specified\n"
2502 "by the delim keyword. E.g.: \"17 delim=@ qacc sacc score\".\n"
2503 "The delim keyword must appear after the numeric output format\n"
2504 "specification.\n"
2505 "The supported format specifiers for options 6, 7 and 10 are:\n";
2506 }
2507 else {
2508 kOutputFormatDescription +=
2509 "Options 6, 7 and 10 "
2510 "can be additionally configured to produce\n"
2511 "a custom format specified by space delimited format specifiers,\n"
2512 "or by a token specified by the delim keyword.\n"
2513 " E.g.: \"10 delim=@ qacc sacc score\".\n"
2514 "The delim keyword must appear after the numeric output format\n"
2515 "specification.\n"
2516 "The supported format specifiers are:\n";
2517 }
2518
2519 kOutputFormatDescription += DescribeTabularOutputFormatSpecifiers() + string("\n");
2520
2521 if(m_FormatFlags & eIsSAM) {
2522 kOutputFormatDescription +=
2523 "The supported format specifier for option 17 is:\n" +
2524 DescribeSAMOutputFormatSpecifiers();
2525 }
2526
2527
2528 int dft_outfmt = kDfltArgOutputFormat;
2529
2530 // Igblast shows extra column of gaps
2531 if (m_IsIgBlast) {
2532 kOutputFormatDescription = string(
2533 "alignment view options:\n"
2534 " 3 = Flat query-anchored, show identities,\n"
2535 " 4 = Flat query-anchored, no identities,\n"
2536 " 7 = Tabular with comment lines\n"
2537 " 19 = Rearrangement summary report (AIRR format)\n\n"
2538 "Options 7 can be additionally configured to produce\n"
2539 "a custom format specified by space delimited format specifiers.\n"
2540 "The supported format specifiers are:\n") +
2541 DescribeTabularOutputFormatSpecifiers(true) +
2542 string("\n");
2543 dft_outfmt = 3;
2544 }
2545
2546 // alignment view
2547 arg_desc.AddDefaultKey(kArgOutputFormat, "format",
2548 kOutputFormatDescription,
2549 CArgDescriptions::eString,
2550 NStr::IntToString(dft_outfmt));
2551
2552 // show GIs in deflines
2553 arg_desc.AddFlag(kArgShowGIs, "Show NCBI GIs in deflines?", true);
2554
2555 // number of one-line descriptions to display
2556 arg_desc.AddOptionalKey(kArgNumDescriptions, "int_value",
2557 "Number of database sequences to show one-line "
2558 "descriptions for\n"
2559 "Not applicable for outfmt > 4\n"
2560 "Default = `"+ NStr::IntToString(m_DfltNumDescriptions)+ "'",
2561 CArgDescriptions::eInteger);
2562 arg_desc.SetConstraint(kArgNumDescriptions,
2563 new CArgAllowValuesGreaterThanOrEqual(0));
2564
2565 // number of alignments per DB sequence
2566 arg_desc.AddOptionalKey(kArgNumAlignments, "int_value",
2567 "Number of database sequences to show alignments for\n"
2568 "Default = `" + NStr::IntToString(m_DfltNumAlignments) + "'",
2569 CArgDescriptions::eInteger );
2570 arg_desc.SetConstraint(kArgNumAlignments,
2571 new CArgAllowValuesGreaterThanOrEqual(0));
2572
2573 arg_desc.AddOptionalKey(kArgLineLength, "line_length",
2574 "Line length for formatting alignments\n"
2575 "Not applicable for outfmt > 4\n"
2576 "Default = `"+ NStr::NumericToString(align_format::kDfltLineLength) + "'",
2577 CArgDescriptions::eInteger);
2578 arg_desc.SetConstraint(kArgLineLength,
2579 new CArgAllowValuesGreaterThanOrEqual(1));
2580
2581 if(!m_IsIgBlast){
2582 // Produce HTML?
2583 arg_desc.AddFlag(kArgProduceHtml, "Produce HTML output?", true);
2584
2585
2586 arg_desc.AddOptionalKey(kArgSortHits, "sort_hits",
2587 "Sorting option for hits:\n"
2588 "alignment view options:\n"
2589 " 0 = Sort by evalue,\n"
2590 " 1 = Sort by bit score,\n"
2591 " 2 = Sort by total score,\n"
2592 " 3 = Sort by percent identity,\n"
2593 " 4 = Sort by query coverage\n"
2594 "Not applicable for outfmt > 4\n",
2595 CArgDescriptions::eInteger);
2596 arg_desc.SetConstraint(kArgSortHits,
2597 new CArgAllowValuesBetween(CAlignFormatUtil::eEvalue,
2598 CAlignFormatUtil::eQueryCoverage,
2599 true));
2600
2601 arg_desc.AddOptionalKey(kArgSortHSPs, "sort_hsps",
2602 "Sorting option for hps:\n"
2603 " 0 = Sort by hsp evalue,\n"
2604 " 1 = Sort by hsp score,\n"
2605 " 2 = Sort by hsp query start,\n"
2606 " 3 = Sort by hsp percent identity,\n"
2607 " 4 = Sort by hsp subject start\n"
2608 "Not applicable for outfmt != 0\n",
2609 CArgDescriptions::eInteger);
2610 arg_desc.SetConstraint(kArgSortHSPs,
2611 new CArgAllowValuesBetween(CAlignFormatUtil::eHspEvalue,
2612 CAlignFormatUtil::eSubjectStart,
2613 true));
2614 /// Hit list size, listed here for convenience only
2615 arg_desc.SetCurrentGroup("Restrict search or results");
2616 arg_desc.AddOptionalKey(kArgMaxTargetSequences, "num_sequences",
2617 "Maximum number of aligned sequences to keep \n"
2618 "(value of 5 or more is recommended)\n"
2619 "Default = `" + NStr::IntToString(BLAST_HITLIST_SIZE) + "'",
2620 CArgDescriptions::eInteger);
2621 arg_desc.SetConstraint(kArgMaxTargetSequences,
2622 new CArgAllowValuesGreaterThanOrEqual(1));
2623 arg_desc.SetDependency(kArgMaxTargetSequences,
2624 CArgDescriptions::eExcludes,
2625 kArgNumDescriptions);
2626 arg_desc.SetDependency(kArgMaxTargetSequences,
2627 CArgDescriptions::eExcludes,
2628 kArgNumAlignments);
2629 }
2630 arg_desc.SetCurrentGroup("");
2631 }
2632
2633 bool
ArchiveFormatRequested(const CArgs & args) const2634 CFormattingArgs::ArchiveFormatRequested(const CArgs& args) const
2635 {
2636 EOutputFormat output_fmt;
2637 string ignore1, ignore2;
2638 ParseFormattingString(args, output_fmt, ignore1, ignore2);
2639 return (output_fmt == eArchiveFormat ? true : false);
2640 }
2641
2642
s_ValidateCustomDelim(string custom_fmt_spec,string customDelim)2643 static void s_ValidateCustomDelim(string custom_fmt_spec,string customDelim)
2644 {
2645 bool error = false;
2646 string checkfield;
2647 custom_fmt_spec = NStr::TruncateSpaces(custom_fmt_spec);
2648 if(custom_fmt_spec.empty()) return;
2649
2650 //Check if delim is already used
2651 const string kFieldsWithSemicolSeparator = "sallseqid staxids sscinames scomnames sblastnames sskingdoms";//sep = ";"
2652 const string kFramesField = "frames"; //sep = "/"
2653 const string kAllTitlesField ="salltitles"; //sep = "<>""
2654
2655 if(customDelim == ";") {
2656 vector <string> tokens;
2657 NStr::Split(kFieldsWithSemicolSeparator," ", tokens);
2658 for(size_t i = 0; i < tokens.size(); i++) {
2659 if(NStr::Find(custom_fmt_spec,tokens[i]) != NPOS) {
2660 checkfield = tokens[i];
2661 error = true;
2662 break;
2663 }
2664 }
2665 }
2666 else {
2667 if(customDelim == "/") {
2668 checkfield = kFramesField;
2669 }
2670 else if(customDelim == "<>") {
2671 checkfield = kAllTitlesField;
2672 }
2673 if(!checkfield.empty() && NStr::Find(custom_fmt_spec,checkfield) != NPOS) {
2674 error = true;
2675 }
2676 }
2677
2678 if(error) {
2679 string msg("Your custom record separator (" + customDelim + ") is also used by the format specifier (" + checkfield +
2680 ") to separate multiple entries. Please use a different record separator (delim keyword).");
2681 NCBI_THROW(CInputException, eInvalidInput, msg);
2682 }
2683 }
2684
2685 void
ParseFormattingString(const CArgs & args,EOutputFormat & fmt_type,string & custom_fmt_spec,string & custom_delim) const2686 CFormattingArgs::ParseFormattingString(const CArgs& args,
2687 EOutputFormat& fmt_type,
2688 string& custom_fmt_spec,
2689 string& custom_delim) const
2690 {
2691 custom_fmt_spec.clear();
2692 if (args[kArgOutputFormat]) {
2693 string fmt_choice =
2694 NStr::TruncateSpaces(args[kArgOutputFormat].AsString());
2695 string::size_type pos;
2696 if ( (pos = fmt_choice.find_first_of(' ')) != string::npos) {
2697 custom_fmt_spec.assign(fmt_choice, pos+1,
2698 fmt_choice.size()-(pos+1));
2699 fmt_choice.erase(pos);
2700 }
2701 if(!custom_fmt_spec.empty()) {
2702 if(NStr::StartsWith(custom_fmt_spec, "delim")) {
2703 vector <string> tokens;
2704 NStr::Split(custom_fmt_spec," ",tokens);
2705 if(tokens.size() > 0) {
2706 string tag;
2707 bool isValid = NStr::SplitInTwo(tokens[0],"=",tag,custom_delim);
2708 if(!isValid) {
2709 string msg("Delimiter format is invalid. Valid format is delim=<delimiter value>");
2710 NCBI_THROW(CInputException, eInvalidInput, msg);
2711 }
2712 else {
2713 custom_fmt_spec = NStr::Replace(custom_fmt_spec,tokens[0],"");
2714 }
2715 }
2716 }
2717 }
2718 int val = 0;
2719 try { val = NStr::StringToInt(fmt_choice); }
2720 catch (const CStringException&) { // probably a conversion error
2721 CNcbiOstrstream os;
2722 os << "'" << fmt_choice << "' is not a valid output format";
2723 string msg = CNcbiOstrstreamToString(os);
2724 NCBI_THROW(CInputException, eInvalidInput, msg);
2725 }
2726 if (val < 0 || val >= static_cast<int>(eEndValue)) {
2727 string msg("Formatting choice is out of range");
2728 throw std::out_of_range(msg);
2729 }
2730 if (m_IsIgBlast && (val != 3 && val != 4 && val != 7 && val != eAirrRearrangement)) {
2731 string msg("Formatting choice is not valid");
2732 throw std::out_of_range(msg);
2733 }
2734 fmt_type = static_cast<EOutputFormat>(val);
2735 if ( !(fmt_type == eTabular ||
2736 fmt_type == eTabularWithComments ||
2737 fmt_type == eCommaSeparatedValues ||
2738 fmt_type == eSAM) ) {
2739 custom_fmt_spec.clear();
2740 }
2741 }
2742 }
2743
2744
2745 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)2746 CFormattingArgs::ExtractAlgorithmOptions(const CArgs& args,
2747 CBlastOptions& opt)
2748 {
2749 ParseFormattingString(args, m_OutputFormat, m_CustomOutputFormatSpec,m_CustomDelim);
2750 if((m_OutputFormat == eSAM) && !(m_FormatFlags & eIsSAM) ){
2751 NCBI_THROW(CInputException, eInvalidInput,
2752 "SAM format is only applicable to blastn" );
2753 }
2754 if((m_OutputFormat == eAirrRearrangement) && !(m_FormatFlags & eIsAirrRearrangement) ){
2755 NCBI_THROW(CInputException, eInvalidInput,
2756 "AIRR rearrangement format is only applicable to igblastn" );
2757 }
2758 if (m_OutputFormat == eFasta) {
2759 NCBI_THROW(CInputException, eInvalidInput,
2760 "FASTA output format is only applicable to magicblast");
2761 }
2762 s_ValidateCustomDelim(m_CustomOutputFormatSpec,m_CustomDelim);
2763 m_ShowGis = static_cast<bool>(args[kArgShowGIs]);
2764 if(m_IsIgBlast){
2765 m_Html = false;
2766 } else {
2767 m_Html = static_cast<bool>(args[kArgProduceHtml]);
2768 }
2769 // Default hitlist size 500, value can be changed if import search strategy is used
2770 int hitlist_size = opt.GetHitlistSize();
2771
2772 // To preserve hitlist size in import search strategy > 500,
2773 // we need to increase the num_ descriptions and num_alignemtns
2774 if(hitlist_size > BLAST_HITLIST_SIZE )
2775 {
2776 if((!args.Exist(kArgNumDescriptions) || !args[kArgNumDescriptions]) &&
2777 (!args.Exist(kArgNumAlignments) || !args[kArgNumAlignments]) &&
2778 (m_OutputFormat <= eFlatQueryAnchoredNoIdentities)) {
2779 m_NumDescriptions = hitlist_size;
2780 m_NumAlignments = hitlist_size/ 2;
2781 return;
2782 }
2783 }
2784
2785 if(m_OutputFormat <= eFlatQueryAnchoredNoIdentities) {
2786
2787
2788 m_NumDescriptions = m_DfltNumDescriptions;
2789 m_NumAlignments = m_DfltNumAlignments;
2790
2791 if (args.Exist(kArgNumDescriptions) && args[kArgNumDescriptions]) {
2792 m_NumDescriptions = args[kArgNumDescriptions].AsInteger();
2793 }
2794
2795 if (args.Exist(kArgNumAlignments) && args[kArgNumAlignments]) {
2796 m_NumAlignments = args[kArgNumAlignments].AsInteger();
2797 }
2798
2799 if (args.Exist(kArgMaxTargetSequences) && args[kArgMaxTargetSequences]) {
2800 m_NumDescriptions = args[kArgMaxTargetSequences].AsInteger();
2801 m_NumAlignments = args[kArgMaxTargetSequences].AsInteger();
2802 hitlist_size = m_NumAlignments;
2803 }
2804
2805 // The If clause is for handling import_search_strategy hitlist size < 500
2806 // We want to preserve the hitlist size in iss if no formatting input is entered in cmdline
2807 // If formmating option(s) is entered than the iss hitlist size is overridden.
2808 // FIXME: does this work with import search strategies?
2809 if ((args.Exist(kArgNumDescriptions) && args[kArgNumDescriptions]) ||
2810 (args.Exist(kArgNumAlignments) && args[kArgNumAlignments])) {
2811 hitlist_size = max(m_NumDescriptions, m_NumAlignments);
2812 }
2813
2814 if (args[kArgLineLength]) {
2815 m_LineLength = args[kArgLineLength].AsInteger();
2816 }
2817 if(args.Exist(kArgSortHits) && args[kArgSortHits])
2818 {
2819 m_HitsSortOption = args[kArgSortHits].AsInteger();
2820 }
2821 }
2822 else
2823 {
2824 if (args.Exist(kArgNumDescriptions) && args[kArgNumDescriptions]) {
2825 ERR_POST(Warning << "The parameter -num_descriptions is ignored for "
2826 "output formats > 4 . Use -max_target_seqs "
2827 "to control output");
2828 }
2829
2830 if (args[kArgLineLength]) {
2831 ERR_POST(Warning << "The parameter -line_length is not applicable for "
2832 "output formats > 4 .");
2833 }
2834
2835 if (args.Exist(kArgMaxTargetSequences) && args[kArgMaxTargetSequences]) {
2836 hitlist_size = args[kArgMaxTargetSequences].AsInteger();
2837 }
2838 else if (args.Exist(kArgNumAlignments) && args[kArgNumAlignments]) {
2839 hitlist_size = args[kArgNumAlignments].AsInteger();
2840 }
2841
2842 m_NumDescriptions = hitlist_size;
2843 m_NumAlignments = hitlist_size;
2844
2845 if(args.Exist(kArgSortHits) && args[kArgSortHits]) {
2846 ERR_POST(Warning << "The parameter -sorthits is ignored for output formats > 4.");
2847 }
2848 }
2849
2850 if(hitlist_size < 5){
2851 ERR_POST(Warning << "Examining 5 or more matches is recommended");
2852 }
2853 opt.SetHitlistSize(hitlist_size);
2854
2855 if(args.Exist(kArgSortHSPs) && args[kArgSortHSPs])
2856 {
2857 int hspsSortOption = args[kArgSortHSPs].AsInteger();
2858 if(m_OutputFormat == ePairwise) {
2859 m_HspsSortOption = hspsSortOption;
2860 }
2861 else {
2862 ERR_POST(Warning << "The parameter -sorthsps is ignored for output formats != 0.");
2863 }
2864 }
2865 return;
2866 }
2867
2868
2869 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)2870 CMapperFormattingArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
2871 {
2872 arg_desc.SetCurrentGroup("Formatting options");
2873 string kOutputFormatDescription = string(
2874 "alignment view options:\n"
2875 "sam = SAM format,\n"
2876 "tabular = Tabular format,\n"
2877 "asn = text ASN.1\n");
2878
2879 string kUnalignedOutputFormatDescription = string(
2880 "format for reporting unaligned reads:\n"
2881 "sam = SAM format,\n"
2882 "tabular = Tabular format,\n"
2883 "fasta = sequences in FASTA format\n"
2884 "Default = same as ") +
2885 align_format::kArgOutputFormat;
2886
2887 arg_desc.AddDefaultKey(align_format::kArgOutputFormat, "format",
2888 kOutputFormatDescription,
2889 CArgDescriptions::eString,
2890 "sam");
2891
2892 set<string> allowed_formats = {"sam", "tabular", "asn"};
2893 arg_desc.SetConstraint(align_format::kArgOutputFormat,
2894 new CArgAllowStringSet(allowed_formats));
2895
2896 arg_desc.AddOptionalKey(kArgUnalignedFormat, "format",
2897 kUnalignedOutputFormatDescription,
2898 CArgDescriptions::eString);
2899
2900 set<string> allowed_unaligned_formats = {"sam", "tabular", "fasta"};
2901 arg_desc.SetConstraint(kArgUnalignedFormat,
2902 new CArgAllowStringSet(allowed_unaligned_formats));
2903
2904 arg_desc.SetDependency(kArgUnalignedFormat, CArgDescriptions::eRequires,
2905 kArgUnalignedOutput);
2906
2907
2908 arg_desc.AddFlag(kArgPrintMdTag, "Include MD tag in SAM report");
2909 arg_desc.AddFlag(kArgNoReadIdTrim, "Do not trim '.1', '/1', '.2', " \
2910 "or '/2' at the end of read ids for SAM format and" \
2911 "paired runs");
2912
2913 arg_desc.AddFlag(kArgNoUnaligned, "Do not report unaligned reads");
2914
2915 arg_desc.AddFlag(kArgNoDiscordant,
2916 "Suppress discordant alignments for paired reads");
2917
2918 arg_desc.SetCurrentGroup("");
2919 }
2920
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opt)2921 void CMapperFormattingArgs::ExtractAlgorithmOptions(const CArgs& args,
2922 CBlastOptions& opt)
2923 {
2924 if (args.Exist(align_format::kArgOutputFormat)) {
2925 string fmt_choice = args[align_format::kArgOutputFormat].AsString();
2926 if (fmt_choice == "sam") {
2927 m_OutputFormat = eSAM;
2928 }
2929 else if (fmt_choice == "tabular") {
2930 m_OutputFormat = eTabular;
2931 }
2932 else if (fmt_choice == "asn") {
2933 m_OutputFormat = eAsnText;
2934 }
2935 else {
2936 CNcbiOstrstream os;
2937 os << "'" << fmt_choice << "' is not a valid output format";
2938 string msg = CNcbiOstrstreamToString(os);
2939 NCBI_THROW(CInputException, eInvalidInput, msg);
2940 }
2941
2942 m_UnalignedOutputFormat = m_OutputFormat;
2943 }
2944
2945 if (args.Exist(kArgUnalignedFormat) && args[kArgUnalignedFormat]) {
2946 string fmt_choice = args[kArgUnalignedFormat].AsString();
2947 if (fmt_choice == "sam") {
2948 m_UnalignedOutputFormat = eSAM;
2949 }
2950 else if (fmt_choice == "tabular") {
2951 m_UnalignedOutputFormat = eTabular;
2952 }
2953 else if (fmt_choice == "fasta") {
2954 m_UnalignedOutputFormat = eFasta;
2955 }
2956 else {
2957 CNcbiOstrstream os;
2958 os << "'" << fmt_choice
2959 << "' is not a valid output format for unaligned reads";
2960 string msg = CNcbiOstrstreamToString(os);
2961 NCBI_THROW(CInputException, eInvalidInput, msg);
2962 }
2963 }
2964
2965 m_ShowGis = true;
2966 m_Html = false;
2967
2968 if (args.Exist(kArgNoReadIdTrim) && args[kArgNoReadIdTrim]) {
2969 m_TrimReadIds = false;
2970 }
2971
2972 if (args.Exist(kArgNoUnaligned) && args[kArgNoUnaligned]) {
2973 m_PrintUnaligned = false;
2974 }
2975
2976 if (args.Exist(kArgNoDiscordant) && args[kArgNoDiscordant]) {
2977 m_NoDiscordant = true;
2978 }
2979
2980 if (args.Exist(kArgFwdRev) && args[kArgFwdRev]) {
2981 m_FwdRev = true;
2982 }
2983
2984 if (args.Exist(kArgRevFwd) && args[kArgRevFwd]) {
2985 m_RevFwd = true;
2986 }
2987
2988 if (args.Exist(kArgFwdOnly) && args[kArgFwdOnly]) {
2989 m_FwdOnly = true;
2990 }
2991
2992 if (args.Exist(kArgRevOnly) && args[kArgRevOnly]) {
2993 m_RevOnly = true;
2994 }
2995
2996 if (args.Exist(kArgOnlyStrandSpecific) && args[kArgOnlyStrandSpecific]) {
2997 m_OnlyStrandSpecific = true;
2998 }
2999
3000 if (args.Exist(kArgPrintMdTag) && args[kArgPrintMdTag]) {
3001 m_PrintMdTag = true;
3002 }
3003
3004 // only the fast tabular format is able to show merged HSPs with
3005 // common query bases
3006 if (m_OutputFormat != eTabular) {
3007 // FIXME: This is a hack. Merging should be done by the formatter,
3008 // but is currently done by HSP stream writer. This is an easy
3009 // switch until merging is implemented properly.
3010 CNcbiEnvironment().Set("MAPPER_NO_OVERLAPPED_HSP_MERGE", "1");
3011 }
3012 }
3013
3014 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3015 CMTArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3016 {
3017 // number of threads
3018 arg_desc.SetCurrentGroup("Miscellaneous options");
3019 #ifdef NCBI_THREADS
3020 const int kMinValue = static_cast<int>(CThreadable::kMinNumThreads);
3021 const int kMaxValue = static_cast<int>(CSystemInfo::GetCpuCount());
3022 const int kDfltValue = m_NumThreads != CThreadable::kMinNumThreads
3023 ? std::min<int>(m_NumThreads, kMaxValue) : kMinValue;
3024
3025 arg_desc.AddDefaultKey(kArgNumThreads, "int_value",
3026 "Number of threads (CPUs) to use in the BLAST search",
3027 CArgDescriptions::eInteger,
3028 NStr::IntToString(kDfltValue));
3029 arg_desc.SetConstraint(kArgNumThreads,
3030 new CArgAllowValuesGreaterThanOrEqual(kMinValue));
3031 arg_desc.SetDependency(kArgNumThreads,
3032 CArgDescriptions::eExcludes,
3033 kArgRemote);
3034
3035 if (m_MTMode >= 0) {
3036 arg_desc.AddDefaultKey(kArgMTMode, "int_value",
3037 "Multi-thread mode to use in BLAST search:\n "
3038 "0 (auto) split by database \n "
3039 "1 split by queries",
3040 CArgDescriptions::eInteger,
3041 NStr::IntToString(0));
3042 arg_desc.SetConstraint(kArgMTMode,
3043 new CArgAllowValuesBetween(0, 1, true));
3044 arg_desc.SetDependency(kArgMTMode,
3045 CArgDescriptions::eRequires,
3046 kArgNumThreads);
3047 }
3048 /*
3049 arg_desc.SetDependency(kArgNumThreads,
3050 CArgDescriptions::eExcludes,
3051 kArgUseIndex);
3052 */
3053 #endif
3054 arg_desc.SetCurrentGroup("");
3055 }
3056
CMTArgs(const CArgs & args)3057 CMTArgs::CMTArgs(const CArgs& args)
3058 {
3059 x_ExtractAlgorithmOptions(args);
3060 }
3061
3062
3063 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions &)3064 CMTArgs::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& /* opts */)
3065 {
3066 x_ExtractAlgorithmOptions(args);
3067 }
3068 void
x_ExtractAlgorithmOptions(const CArgs & args)3069 CMTArgs::x_ExtractAlgorithmOptions(const CArgs& args)
3070 {
3071 const int kMaxValue = static_cast<int>(CSystemInfo::GetCpuCount());
3072
3073 if (args.Exist(kArgNumThreads) &&
3074 args[kArgNumThreads].HasValue()) { // could be cancelled by the exclusion in CRemoteArgs
3075
3076 // use the minimum of the two: user requested number of threads and
3077 // number of available CPUs for number of threads
3078 int num_threads = args[kArgNumThreads].AsInteger();
3079 if (num_threads > kMaxValue) {
3080 m_NumThreads = kMaxValue;
3081
3082 ERR_POST(Warning << (string)"Number of threads was reduced to " +
3083 NStr::IntToString((unsigned int)m_NumThreads) +
3084 " to match the number of available CPUs");
3085 }
3086 else {
3087 m_NumThreads = num_threads;
3088 }
3089
3090 // This is temporarily ignored (per SB-635)
3091 if (args.Exist(kArgSubject) && args[kArgSubject].HasValue() &&
3092 m_NumThreads != CThreadable::kMinNumThreads) {
3093 m_NumThreads = CThreadable::kMinNumThreads;
3094 string opt = kArgNumThreads;
3095 if (args.Exist(kArgMTMode) &&
3096 (args[kArgMTMode].AsInteger() == CMTArgs::eSplitByQueries)) {
3097 m_MTMode = CMTArgs::eSplitByDB;
3098 opt += " and " + kArgMTMode;
3099 }
3100 ERR_POST(Warning << "'" << opt << "' is currently "
3101 << "ignored when '" << kArgSubject << "' is specified.");
3102 return;
3103 }
3104 }
3105 if (args.Exist(kArgMTMode) && args[kArgMTMode].HasValue()) {
3106 m_MTMode = (EMTMode) args[kArgMTMode].AsInteger();
3107 }
3108
3109 }
3110
3111 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3112 CRemoteArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3113 {
3114 arg_desc.SetCurrentGroup("Miscellaneous options");
3115 arg_desc.AddFlag(kArgRemote, "Execute search remotely?", true);
3116
3117 arg_desc.SetCurrentGroup("");
3118 }
3119
3120 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions &)3121 CRemoteArgs::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& /* opts */)
3122 {
3123 if (args.Exist(kArgRemote)) {
3124 m_IsRemote = static_cast<bool>(args[kArgRemote]);
3125 }
3126 }
3127
3128 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3129 CDebugArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3130 {
3131 #if _BLAST_DEBUG
3132 arg_desc.SetCurrentGroup("Miscellaneous options");
3133 arg_desc.AddFlag("verbose", "Produce verbose output (show BLAST options)",
3134 true);
3135 arg_desc.AddFlag("remote_verbose",
3136 "Produce verbose output for remote searches", true);
3137 arg_desc.AddFlag("use_test_remote_service",
3138 "Send remote requests to test servers", true);
3139 arg_desc.SetCurrentGroup("");
3140 #endif /* _BLAST_DEBUG */
3141 }
3142
3143 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions &)3144 CDebugArgs::ExtractAlgorithmOptions(const CArgs& args, CBlastOptions& /* opts */)
3145 {
3146 #if _BLAST_DEBUG
3147 m_DebugOutput = static_cast<bool>(args["verbose"]);
3148 m_RmtDebugOutput = static_cast<bool>(args["remote_verbose"]);
3149 if (args["use_test_remote_service"]) {
3150 IRWRegistry& reg = CNcbiApplication::Instance()->GetConfig();
3151 reg.Set("BLAST4", DEF_CONN_REG_SECTION "_" REG_CONN_SERVICE_NAME,
3152 "blast4_test");
3153 }
3154 #endif /* _BLAST_DEBUG */
3155 }
3156
3157 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3158 CHspFilteringArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3159 {
3160 // culling limit
3161 arg_desc.SetCurrentGroup("Restrict search or results");
3162 arg_desc.AddOptionalKey(kArgCullingLimit, "int_value",
3163 "If the query range of a hit is enveloped by that of at "
3164 "least this many higher-scoring hits, delete the hit",
3165 CArgDescriptions::eInteger);
3166 arg_desc.SetConstraint(kArgCullingLimit,
3167 // best hit algorithm arguments
3168 new CArgAllowValuesGreaterThanOrEqual(kDfltArgCullingLimit));
3169
3170 arg_desc.AddOptionalKey(kArgBestHitOverhang, "float_value",
3171 "Best Hit algorithm overhang value "
3172 "(recommended value: " +
3173 NStr::DoubleToString(kDfltArgBestHitOverhang) +
3174 ")",
3175 CArgDescriptions::eDouble);
3176 arg_desc.SetConstraint(kArgBestHitOverhang,
3177 new CArgAllowValuesBetween(kBestHit_OverhangMin,
3178 kBestHit_OverhangMax));
3179 arg_desc.SetDependency(kArgBestHitOverhang,
3180 CArgDescriptions::eExcludes,
3181 kArgCullingLimit);
3182
3183 arg_desc.AddOptionalKey(kArgBestHitScoreEdge, "float_value",
3184 "Best Hit algorithm score edge value "
3185 "(recommended value: " +
3186 NStr::DoubleToString(kDfltArgBestHitScoreEdge) +
3187 ")",
3188 CArgDescriptions::eDouble);
3189 arg_desc.SetConstraint(kArgBestHitScoreEdge,
3190 new CArgAllowValuesBetween(kBestHit_ScoreEdgeMin,
3191 kBestHit_ScoreEdgeMax));
3192 arg_desc.SetDependency(kArgBestHitScoreEdge,
3193 CArgDescriptions::eExcludes,
3194 kArgCullingLimit);
3195 arg_desc.AddFlag(kArgSubjectBestHit, "Turn on best hit per subject sequence", true);
3196
3197 arg_desc.SetCurrentGroup("");
3198 }
3199
3200 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opts)3201 CHspFilteringArgs::ExtractAlgorithmOptions(const CArgs& args,
3202 CBlastOptions& opts)
3203 {
3204 if (args[kArgCullingLimit]) {
3205 opts.SetCullingLimit(args[kArgCullingLimit].AsInteger());
3206 }
3207 if (args[kArgBestHitOverhang]) {
3208 opts.SetBestHitOverhang(args[kArgBestHitOverhang].AsDouble());
3209 }
3210 if (args[kArgBestHitScoreEdge]) {
3211 opts.SetBestHitScoreEdge(args[kArgBestHitScoreEdge].AsDouble());
3212 }
3213 if (args[kArgSubjectBestHit]) {
3214 opts.SetSubjectBestHit();
3215 }
3216 }
3217
3218 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3219 CMbIndexArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3220 {
3221 arg_desc.SetCurrentGroup("General search options");
3222 arg_desc.AddDefaultKey(
3223 kArgUseIndex, "boolean",
3224 "Use MegaBLAST database index",
3225 CArgDescriptions::eBoolean, NStr::BoolToString(kDfltArgUseIndex));
3226 arg_desc.AddOptionalKey(
3227 kArgIndexName, "string",
3228 "MegaBLAST database index name (deprecated; use only for old style indices)",
3229 CArgDescriptions::eString );
3230 arg_desc.SetCurrentGroup( "" );
3231 }
3232
3233 bool
HasBeenSet(const CArgs & args)3234 CMbIndexArgs::HasBeenSet(const CArgs& args)
3235 {
3236 if ( (args.Exist(kArgUseIndex) && args[kArgUseIndex].HasValue()) ||
3237 (args.Exist(kArgIndexName) && args[kArgIndexName].HasValue()) ) {
3238 return true;
3239 }
3240 return false;
3241 }
3242
3243 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions & opts)3244 CMbIndexArgs::ExtractAlgorithmOptions(const CArgs& args,
3245 CBlastOptions& opts)
3246 {
3247 // MB Index does not apply to Blast2Sequences
3248 if( args.Exist( kArgUseIndex ) &&
3249 !(args.Exist( kArgSubject ) && args[kArgSubject])) {
3250
3251 bool use_index = true;
3252 bool force_index = false;
3253 bool old_style_index = false;
3254
3255 if( args[kArgUseIndex] ) {
3256 if( args[kArgUseIndex].AsBoolean() ) force_index = true;
3257 else use_index = false;
3258 }
3259
3260 if( args.Exist( kTask ) && args[kTask] &&
3261 args[kTask].AsString() != "megablast" ) {
3262 use_index = false;
3263 }
3264
3265 if( use_index ) {
3266 string index_name;
3267
3268 if( args.Exist( kArgIndexName ) && args[kArgIndexName] ) {
3269 index_name = args[kArgIndexName].AsString();
3270 old_style_index = true;
3271 }
3272 else if( args.Exist( kArgDb ) && args[kArgDb] ) {
3273 index_name = args[kArgDb].AsString();
3274 }
3275 else {
3276 NCBI_THROW(CInputException, eInvalidInput,
3277 "Can not deduce database index name" );
3278 }
3279
3280 opts.SetUseIndex( true, index_name, force_index, old_style_index );
3281 }
3282 }
3283 }
3284
3285 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3286 CStdCmdLineArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3287 {
3288 arg_desc.SetCurrentGroup("Input query options");
3289
3290 // query filename
3291 arg_desc.AddDefaultKey(kArgQuery, "input_file",
3292 "Input file name",
3293 CArgDescriptions::eInputFile, kDfltArgQuery);
3294 // for now it's either -query or -sra
3295 if( m_SRAaccessionEnabled ) {
3296 arg_desc.AddOptionalKey(kArgSraAccession, "accession",
3297 "Comma-separated SRA accessions",
3298 CArgDescriptions::eString);
3299 arg_desc.SetDependency(kArgSraAccession,
3300 CArgDescriptions::eExcludes,
3301 kArgQuery);
3302 }
3303
3304 arg_desc.SetCurrentGroup("General search options");
3305
3306 // report output file
3307 arg_desc.AddDefaultKey(kArgOutput, "output_file",
3308 "Output file name",
3309 CArgDescriptions::eOutputFile, "-");
3310
3311 if (m_GzipEnabled) {
3312 arg_desc.AddFlag(kArgOutputGzip, "Output will be compressed");
3313 }
3314
3315 arg_desc.SetCurrentGroup("");
3316 }
3317
3318 void
ExtractAlgorithmOptions(const CArgs & args,CBlastOptions &)3319 CStdCmdLineArgs::ExtractAlgorithmOptions(const CArgs& args,
3320 CBlastOptions& /* opt */)
3321 {
3322 if (args.Exist(kArgQuery) && args[kArgQuery].HasValue() &&
3323 m_InputStream == NULL) {
3324
3325 if (m_GzipEnabled &&
3326 NStr::EndsWith(args[kArgQuery].AsString(), ".gz", NStr::eNocase)) {
3327 m_DecompressIStream.reset(new CDecompressIStream(
3328 args[kArgQuery].AsInputFile(),
3329 CDecompressIStream::eGZipFile));
3330 m_InputStream = m_DecompressIStream.get();
3331 }
3332 else {
3333 m_InputStream = &args[kArgQuery].AsInputFile();
3334 }
3335 }
3336
3337 if (args.Exist(kArgOutputGzip) && args[kArgOutputGzip]) {
3338 m_CompressOStream.reset(new CCompressOStream(
3339 args[kArgOutput].AsOutputFile(),
3340 CCompressOStream::eGZipFile));
3341 m_OutputStream = m_CompressOStream.get();
3342 }
3343 else {
3344 m_OutputStream = &args[kArgOutput].AsOutputFile();
3345 }
3346
3347 // stream for unaligned reads in magicblast
3348 if (args.Exist(kArgUnalignedOutput) && args[kArgUnalignedOutput]) {
3349 if (args.Exist(kArgOutputGzip) && args[kArgOutputGzip]) {
3350 m_UnalignedCompressOStream.reset(new CCompressOStream(
3351 args[kArgUnalignedOutput].AsOutputFile(),
3352 CCompressOStream::eGZipFile));
3353 m_UnalignedOutputStream = m_UnalignedCompressOStream.get();
3354 }
3355 else {
3356 m_UnalignedOutputStream = &args[kArgUnalignedOutput].AsOutputFile();
3357 }
3358 }
3359 }
3360
3361 CNcbiIstream&
GetInputStream() const3362 CStdCmdLineArgs::GetInputStream() const
3363 {
3364 // programmer must ensure the ExtractAlgorithmOptions method is called
3365 // before this method is invoked
3366 if ( !m_InputStream ) {
3367 abort();
3368 }
3369 return *m_InputStream;
3370 }
3371
3372 CNcbiOstream&
GetOutputStream() const3373 CStdCmdLineArgs::GetOutputStream() const
3374 {
3375 // programmer must ensure the ExtractAlgorithmOptions method is called
3376 // before this method is invoked
3377 _ASSERT(m_OutputStream);
3378 return *m_OutputStream;
3379 }
3380
3381 void
SetInputStream(CRef<CTmpFile> input_file)3382 CStdCmdLineArgs::SetInputStream(CRef<CTmpFile> input_file)
3383 {
3384 m_QueryTmpInputFile = input_file;
3385 m_InputStream = &input_file->AsInputFile(CTmpFile::eIfExists_Throw);
3386 }
3387
3388 void
SetArgumentDescriptions(CArgDescriptions & arg_desc)3389 CSearchStrategyArgs::SetArgumentDescriptions(CArgDescriptions& arg_desc)
3390 {
3391 arg_desc.SetCurrentGroup("Search strategy options");
3392
3393 arg_desc.AddOptionalKey(kArgInputSearchStrategy,
3394 "filename",
3395 "Search strategy to use",
3396 CArgDescriptions::eInputFile);
3397 arg_desc.AddOptionalKey(kArgOutputSearchStrategy,
3398 "filename",
3399 "File name to record the search strategy used",
3400 CArgDescriptions::eOutputFile);
3401 arg_desc.SetDependency(kArgInputSearchStrategy,
3402 CArgDescriptions::eExcludes,
3403 kArgOutputSearchStrategy);
3404
3405 arg_desc.SetCurrentGroup("");
3406 }
3407
3408 void
ExtractAlgorithmOptions(const CArgs &,CBlastOptions &)3409 CSearchStrategyArgs::ExtractAlgorithmOptions(const CArgs& /* cmd_line_args */,
3410 CBlastOptions& /* options */)
3411 {
3412 }
3413
3414 CNcbiIstream*
GetImportStream(const CArgs & args) const3415 CSearchStrategyArgs::GetImportStream(const CArgs& args) const
3416 {
3417 CNcbiIstream* retval = NULL;
3418 if (args.Exist(kArgInputSearchStrategy) &&
3419 args[kArgInputSearchStrategy].HasValue()) {
3420 retval = &args[kArgInputSearchStrategy].AsInputFile();
3421 }
3422 return retval;
3423 }
3424
3425 CNcbiOstream*
GetExportStream(const CArgs & args) const3426 CSearchStrategyArgs::GetExportStream(const CArgs& args) const
3427 {
3428 CNcbiOstream* retval = NULL;
3429 if (args.Exist(kArgOutputSearchStrategy) &&
3430 args[kArgOutputSearchStrategy].HasValue()) {
3431 retval = &args[kArgOutputSearchStrategy].AsOutputFile();
3432 }
3433 return retval;
3434 }
3435
CBlastAppArgs()3436 CBlastAppArgs::CBlastAppArgs()
3437 {
3438 m_SearchStrategyArgs.Reset(new CSearchStrategyArgs);
3439 m_Args.push_back(CRef<IBlastCmdLineArgs>(&*m_SearchStrategyArgs));
3440 m_IsUngapped = false;
3441 }
3442
3443 CArgDescriptions*
SetCommandLine()3444 CBlastAppArgs::SetCommandLine()
3445 {
3446 return SetUpCommandLineArguments(m_Args);
3447 }
3448
3449 CRef<CBlastOptionsHandle>
SetOptions(const CArgs & args)3450 CBlastAppArgs::SetOptions(const CArgs& args)
3451 {
3452 // We're recovering from a saved strategy or combining
3453 // CBlastOptions/CBlastOptionsHandle with command line options (in GBench,
3454 // see GB-1116), so we need to still extract
3455 // certain options from the command line, include overriding query
3456 // and/or database
3457 if (m_OptsHandle.NotEmpty()) {
3458 CBlastOptions& opts = m_OptsHandle->SetOptions();
3459 //opts.DebugDumpText(cerr, "OptionsBeforeLoop", 1);
3460 const bool mbidxargs_set = CMbIndexArgs::HasBeenSet(args);
3461 const bool dbargs_set = CBlastDatabaseArgs::HasBeenSet(args);
3462 NON_CONST_ITERATE(TBlastCmdLineArgs, arg, m_Args) {
3463 if (dynamic_cast<CMbIndexArgs*>(&**arg)) {
3464 if (mbidxargs_set)
3465 (*arg)->ExtractAlgorithmOptions(args, opts);
3466 } else if (dynamic_cast<CBlastDatabaseArgs*>(&**arg)) {
3467 if (dbargs_set)
3468 m_BlastDbArgs->ExtractAlgorithmOptions(args, opts);
3469 } else {
3470 (*arg)->ExtractAlgorithmOptions(args, opts);
3471 }
3472 }
3473 m_IsUngapped = !opts.GetGappedMode();
3474 try { m_OptsHandle->Validate(); }
3475 catch (const CBlastException& e) {
3476 NCBI_THROW(CInputException, eInvalidInput, e.GetMsg());
3477 }
3478 //opts.DebugDumpText(cerr, "OptionsAfterLoop", 1);
3479 return m_OptsHandle;
3480 }
3481
3482 CBlastOptions::EAPILocality locality =
3483 (args.Exist(kArgRemote) && args[kArgRemote])
3484 ? CBlastOptions::eRemote
3485 : CBlastOptions::eLocal;
3486
3487 // This is needed as a CRemoteBlast object and its options are instantiated
3488 // to create the search strategy
3489 if (GetExportSearchStrategyStream(args) ||
3490 m_FormattingArgs->ArchiveFormatRequested(args)) {
3491 locality = CBlastOptions::eBoth;
3492 }
3493
3494 CRef<CBlastOptionsHandle> retval(x_CreateOptionsHandle(locality, args));
3495 CBlastOptions& opts = retval->SetOptions();
3496 NON_CONST_ITERATE(TBlastCmdLineArgs, arg, m_Args) {
3497 (*arg)->ExtractAlgorithmOptions(args, opts);
3498 }
3499
3500 m_IsUngapped = !opts.GetGappedMode();
3501 try { retval->Validate(); }
3502 catch (const CBlastException& e) {
3503 NCBI_THROW(CInputException, eInvalidInput, e.GetMsg());
3504 }
3505 return retval;
3506 }
3507
SetTask(const string & task)3508 void CBlastAppArgs::SetTask(const string& task)
3509 {
3510 #if _BLAST_DEBUG
3511 ThrowIfInvalidTask(task);
3512 #endif
3513 m_Task.assign(task);
3514 }
3515
3516 /// Get the input stream
GetInputStream()3517 CNcbiIstream& CBlastAppArgs::GetInputStream() {
3518 return m_StdCmdLineArgs->GetInputStream();
3519 }
3520 /// Get the output stream
GetOutputStream()3521 CNcbiOstream& CBlastAppArgs::GetOutputStream() {
3522 return m_StdCmdLineArgs->GetOutputStream();
3523 }
3524
3525 CArgDescriptions*
SetUpCommandLineArguments(TBlastCmdLineArgs & args)3526 SetUpCommandLineArguments(TBlastCmdLineArgs& args)
3527 {
3528 unique_ptr<CArgDescriptions> retval(new CArgDescriptions);
3529
3530 // Create the groups so that the ordering is established
3531 retval->SetCurrentGroup("Input query options");
3532 retval->SetCurrentGroup("General search options");
3533 retval->SetCurrentGroup("BLAST database options");
3534 retval->SetCurrentGroup("BLAST-2-Sequences options");
3535 retval->SetCurrentGroup("Formatting options");
3536 retval->SetCurrentGroup("Query filtering options");
3537 retval->SetCurrentGroup("Restrict search or results");
3538 retval->SetCurrentGroup("Discontiguous MegaBLAST options");
3539 retval->SetCurrentGroup("Statistical options");
3540 retval->SetCurrentGroup("Search strategy options");
3541 retval->SetCurrentGroup("Extension options");
3542 retval->SetCurrentGroup("");
3543
3544
3545 NON_CONST_ITERATE(TBlastCmdLineArgs, arg, args) {
3546 (*arg)->SetArgumentDescriptions(*retval);
3547 }
3548 return retval.release();
3549 }
3550
3551 CRef<CBlastOptionsHandle>
x_CreateOptionsHandleWithTask(CBlastOptions::EAPILocality locality,const string & task)3552 CBlastAppArgs::x_CreateOptionsHandleWithTask
3553 (CBlastOptions::EAPILocality locality, const string& task)
3554 {
3555 _ASSERT(!task.empty());
3556 CRef<CBlastOptionsHandle> retval;
3557 SetTask(task);
3558 retval.Reset(CBlastOptionsFactory::CreateTask(GetTask(), locality));
3559 _ASSERT(retval.NotEmpty());
3560 return retval;
3561 }
3562
3563 void
x_IssueWarningsForIgnoredOptions(const CArgs & args)3564 CBlastAppArgs::x_IssueWarningsForIgnoredOptions(const CArgs& args)
3565 {
3566 set<string> can_override;
3567 can_override.insert(kArgQuery);
3568 can_override.insert(kArgQueryLocation);
3569 can_override.insert(kArgSubject);
3570 can_override.insert(kArgSubjectLocation);
3571 can_override.insert(kArgUseLCaseMasking);
3572 can_override.insert(kArgDb);
3573 can_override.insert(kArgDbSize);
3574 can_override.insert(kArgEntrezQuery);
3575 can_override.insert(kArgDbSoftMask);
3576 can_override.insert(kArgDbHardMask);
3577 can_override.insert(kArgUseIndex);
3578 can_override.insert(kArgIndexName);
3579 can_override.insert(kArgStrand);
3580 can_override.insert(kArgParseDeflines);
3581 can_override.insert(kArgOutput);
3582 can_override.insert(kArgOutputFormat);
3583 can_override.insert(kArgNumDescriptions);
3584 can_override.insert(kArgNumAlignments);
3585 can_override.insert(kArgMaxTargetSequences);
3586 can_override.insert(kArgRemote);
3587 can_override.insert(kArgNumThreads);
3588 can_override.insert(kArgInputSearchStrategy);
3589 can_override.insert(kArgRemote);
3590 can_override.insert("remote_verbose");
3591 can_override.insert("verbose");
3592
3593 // this stores the arguments (and their defaults) that cannot be overriden
3594 map<string, string> has_defaults;
3595 EBlastProgramType prog = m_OptsHandle->GetOptions().GetProgramType();
3596 has_defaults[kArgCompBasedStats] =
3597 Blast_ProgramIsRpsBlast(prog) ? kDfltArgCompBasedStatsDelta:kDfltArgCompBasedStats;
3598 // FIX the line below for igblast, and add igblast options
3599 has_defaults[kArgEvalue] = NStr::DoubleToString(BLAST_EXPECT_VALUE);
3600 has_defaults[kTask] = m_Task;
3601 has_defaults[kArgOldStyleIndex] = kDfltArgOldStyleIndex;
3602
3603 if (Blast_QueryIsProtein(prog)) {
3604 if (NStr::Find(m_Task, "blastp") != NPOS ||
3605 NStr::Find(m_Task, "psiblast") != NPOS) {
3606 has_defaults[kArgSegFiltering] = kDfltArgNoFiltering;
3607 } else {
3608 has_defaults[kArgSegFiltering] = kDfltArgSegFiltering;
3609 }
3610 has_defaults[kArgLookupTableMaskingOnly] =
3611 kDfltArgLookupTableMaskingOnlyProt;
3612 has_defaults[kArgGapTrigger] =
3613 NStr::IntToString((int)BLAST_GAP_TRIGGER_PROT);
3614 } else {
3615 has_defaults[kArgDustFiltering] = kDfltArgDustFiltering;
3616 has_defaults[kArgLookupTableMaskingOnly] =
3617 kDfltArgLookupTableMaskingOnlyNucl;
3618 has_defaults[kArgGapTrigger] =
3619 NStr::IntToString((int)BLAST_GAP_TRIGGER_NUCL);
3620 }
3621 has_defaults[kArgOffDiagonalRange] =
3622 NStr::IntToString(kDfltOffDiagonalRange);
3623 has_defaults[kArgMaskLevel] = kDfltArgMaskLevel;
3624 has_defaults[kArgMaxIntronLength] =
3625 NStr::IntToString(kDfltArgMaxIntronLength);
3626 has_defaults[kArgQueryGeneticCode] = NStr::IntToString((int)BLAST_GENETIC_CODE);
3627 has_defaults[kArgDbGeneticCode] = NStr::IntToString((int)BLAST_GENETIC_CODE);
3628 // pssm engine/psiblast default options
3629 has_defaults[kArgPSIPseudocount] =
3630 NStr::IntToString(PSI_PSEUDO_COUNT_CONST);
3631 has_defaults[kArgPSIInclusionEThreshold] =
3632 NStr::DoubleToString(PSI_INCLUSION_ETHRESH);
3633 has_defaults[kArgPSINumIterations] =
3634 NStr::IntToString(kDfltArgPSINumIterations);
3635
3636 // get arguments, remove the supported ones and warn about those that
3637 // cannot be overridden.
3638 typedef vector< CRef<CArgValue> > TArgs;
3639 TArgs arguments = args.GetAll();
3640 ITERATE(TArgs, a, arguments) {
3641 const string& arg_name = (*a)->GetName();
3642 const string& arg_value = (*a)->AsString();
3643 // if it has a default value, ignore it if it's not different from the
3644 // default, otherwise, issue a warning
3645 if (has_defaults.find(arg_name) != has_defaults.end()) {
3646 if (has_defaults[arg_name] == arg_value) {
3647 continue;
3648 } else {
3649 if (arg_name == kTask && arg_value == "megablast") {
3650 // No need to issue warning here, as it's OK to change this
3651 continue;
3652 }
3653 ERR_POST(Warning << arg_name << " cannot be overridden when "
3654 "using a search strategy");
3655 }
3656 }
3657 // if the argument cannot be overridden, issue a warning
3658 if (can_override.find(arg_name) == can_override.end()) {
3659 ERR_POST(Warning << arg_name << " cannot be overridden when "
3660 "using a search strategy");
3661 }
3662 }
3663 }
3664
3665 CRef<CBlastOptionsHandle>
SetOptionsForSavedStrategy(const CArgs & args)3666 CBlastAppArgs::SetOptionsForSavedStrategy(const CArgs& args)
3667 {
3668 if(m_OptsHandle.Empty())
3669 {
3670 NCBI_THROW(CInputException, eInvalidInput, "Empty Blast Options Handle");
3671 }
3672
3673 // We're recovering from a saved strategy, so we need to still extract
3674 // certain options from the command line, include overriding query
3675 // and/or database
3676 CBlastOptions& opts = m_OptsHandle->SetOptions();
3677 // invoke ExtractAlgorithmOptions on certain argument classes, i.e.: those
3678 // that should have their arguments overriden
3679 m_QueryOptsArgs->ExtractAlgorithmOptions(args, opts);
3680 m_StdCmdLineArgs->ExtractAlgorithmOptions(args, opts);
3681 m_RemoteArgs->ExtractAlgorithmOptions(args, opts);
3682 m_DebugArgs->ExtractAlgorithmOptions(args, opts);
3683 m_FormattingArgs->ExtractAlgorithmOptions(args, opts);
3684 m_MTArgs->ExtractAlgorithmOptions(args, opts);
3685 if (CBlastDatabaseArgs::HasBeenSet(args)) {
3686 m_BlastDbArgs->ExtractAlgorithmOptions(args, opts);
3687 }
3688 if (CMbIndexArgs::HasBeenSet(args)) {
3689 NON_CONST_ITERATE(TBlastCmdLineArgs, arg, m_Args) {
3690 if (dynamic_cast<CMbIndexArgs*>(arg->GetPointer()) != NULL) {
3691 (*arg)->ExtractAlgorithmOptions(args, opts);
3692 }
3693 }
3694 }
3695 m_IsUngapped = !opts.GetGappedMode();
3696 x_IssueWarningsForIgnoredOptions(args);
3697 try { m_OptsHandle->Validate(); }
3698 catch (const CBlastException& e) {
3699 NCBI_THROW(CInputException, eInvalidInput, e.GetMsg());
3700 }
3701 return m_OptsHandle;
3702 }
3703
3704 END_SCOPE(blast)
3705 END_NCBI_SCOPE
3706