1 /* $Id: makeprofiledb.cpp 631568 2021-05-19 13:53:50Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Amelia Fong
27 *
28 */
29
30 /** @file makeprofiledb.cpp
31 * Command line tool to create RPS,COBALT & DELTA BLAST databases.
32 * This is the successor to formatrpsdb from the C toolkit
33 */
34
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbimisc.hpp>
38 #include <corelib/ncbitime.hpp>
39 #include <util/math/matrix.hpp>
40 #include <serial/objistrasn.hpp>
41 #include <algo/blast/api/version.hpp>
42 #include <algo/blast/blastinput/cmdline_flags.hpp>
43 #include <algo/blast/blastinput/blast_input.hpp>
44 #include <algo/blast/api/pssm_engine.hpp>
45 #include <algo/blast/api/psi_pssm_input.hpp>
46 #include <objects/scoremat/PssmWithParameters.hpp>
47 #include <objects/scoremat/Pssm.hpp>
48 #include <objects/scoremat/PssmIntermediateData.hpp>
49 #include <objects/scoremat/PssmParameters.hpp>
50 #include <objects/scoremat/FormatRpsDbParameters.hpp>
51 #include <objects/scoremat/PssmFinalData.hpp>
52 #include <objects/scoremat/CoreBlock.hpp>
53 #include <objects/scoremat/CoreDef.hpp>
54 #include <objects/scoremat/LoopConstraint.hpp>
55 #include <algo/blast/core/blast_aalookup.h>
56 #include <algo/blast/core/blast_options.h>
57 #include <algo/blast/core/ncbi_math.h>
58 #include <objtools/blast/seqdb_writer/writedb.hpp>
59 #include <objtools/blast/seqdb_writer/build_db.hpp>
60 #include <objtools/blast/seqdb_writer/taxid_set.hpp>
61 #include <objtools/blast/seqdb_writer/writedb_files.hpp>
62 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
63 #include "../blast/blast_app_util.hpp"
64
65 #ifndef SKIP_DOXYGEN_PROCESSING
66 USING_NCBI_SCOPE;
67 USING_SCOPE(blast);
68 USING_SCOPE(objects);
69 #endif /* SKIP_DOXYGEN_PROCESSING */
70
71
72 //Input args specify to makeprofiledb
73 static const string kInPssmList("in");
74 static const string kOutDbName("out");
75 static const string kOutDbType("dbtype");
76 static const string kPssmScaleFactor("scale");
77 static const string kOutIndexFile("index");
78 static const string kObsrThreshold("obsr_threshold");
79 static const string kExcludeInvalid("exclude_invalid");
80 static const string kBinaryScoremat("binary");
81 static const string kUseCmdlineThreshold("force");
82 static const string kMaxSmpFilesPerVol("max_smp_vol");
83
84 static const string kLogFile("logfile");
85
86 //Supported Output Database Types
87 static const string kOutDbRps = "rps";
88 static const string kOutDbCobalt = "cobalt";
89 static const string kOutDbDelta = "delta";
90
91 //Supported Matrices
92 static const string kMatrixBLOSUM62 = "BLOSUM62";
93 static const string kMatrixBLOSUM80 = "BLOSUM80";
94 static const string kMatrixBLOSUM50 = "BLOSUM50";
95 static const string kMatrixBLOSUM45 = "BLOSUM45";
96 static const string kMatrixBLOSUM90 = "BLOSUM90";
97 static const string kMatrixPAM250 = "PAM250";
98 static const string kMatrixPAM30 = "PAM30";
99 static const string kMatrixPAM70 = "PAM70";
100
101 //Default Input Values
102 static const string kDefaultMatrix(kMatrixBLOSUM62);
103 static const string kDefaultOutDbType(kOutDbRps);
104 static const string kDefaultOutIndexFile("true");
105 static const string kDefaultExcludeInvalid("true");
106 #define kDefaultWordScoreThreshold (9.82)
107 #define kDefaultPssmScaleFactor (100.00)
108 #define kDefaultObsrThreshold (6.0)
109 #define kDefaultMaxSmpFilesPerVol (2500)
110
111 //Fix point scale factor for delta blast
112 static const Uint4 kFixedPointScaleFactor = 1000;
113 #define kEpsylon (0.0001)
114
115 #define DEFAULT_POS_MATRIX_SIZE 2000
116 #define RPS_NUM_LOOKUP_CELLS 32768
117 #if BLASTAA_SIZE == 28
118 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
119 #else
120 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
121 #endif
122
123 #define kSingleVol (-1)
124
125 class CMakeDbPosMatrix
126 {
127 public:
CMakeDbPosMatrix()128 CMakeDbPosMatrix(): m_posMatrix(NULL), m_size(0) { };
~CMakeDbPosMatrix()129 ~CMakeDbPosMatrix(){Delete();};
130
131 void Create(int seq_size);
132 void Delete(void);
133
Get(void)134 Int4 ** Get(void) { return m_posMatrix;};
GetSize(void)135 unsigned int GetSize(void){return m_size;};
136
137 private:
138
139 Int4 ** m_posMatrix;
140 int m_size;
141 };
142
Create(int size)143 void CMakeDbPosMatrix::Create(int size)
144 {
145 Delete();
146
147 m_posMatrix = new Int4* [size];
148
149 for(int i = 0; i < size; ++ i)
150 {
151 m_posMatrix[i] = new Int4[BLASTAA_SIZE];
152 }
153 m_size = size;
154
155 return;
156 }
157
Delete(void)158 void CMakeDbPosMatrix::Delete(void)
159 {
160 if( NULL == m_posMatrix)
161 return;
162
163 for(int i = 0; i < m_size; ++ i)
164 {
165 if (m_posMatrix[i] != NULL)
166 delete [] m_posMatrix[i];
167 }
168
169 delete [] m_posMatrix;
170 m_posMatrix = NULL;
171 return;
172 }
173
174 class CMakeProfileDBApp : public CNcbiApplication
175 {
176 public:
177 /** @inheritDoc */
178 CMakeProfileDBApp(void);
179 ~CMakeProfileDBApp();
180 private:
181 /** @inheritDoc */
182 virtual void Init();
183 /** @inheritDoc */
184 virtual int Run();
185
186 enum op_mode
187 {
188 op_rps ,
189 op_cobalt,
190 op_delta ,
191 op_invalid
192 };
193
194 class CRPS_DbInfo
195 {
196 public:
197 string db_name;
198 Int4 num_seqs;
199 CNcbiOfstream lookup_file;
200 CNcbiOfstream pssm_file;
201 CNcbiOfstream aux_file;
202 CNcbiOfstream blocks_file;
203 CNcbiOfstream freq_file;
204
205 CMakeDbPosMatrix pos_matrix;
206 Int4 gap_open;
207 Int4 gap_extend;
208 Int4 scale_factor;
209 Int4 curr_seq_offset;
210 QuerySetUpOptions *query_options;
211 LookupTableOptions *lookup_options;
212 BlastAaLookupTable *lookup;
213 string matrix;
214 CRef<CWriteDB> output_db;
215
CRPS_DbInfo(void)216 CRPS_DbInfo(void):
217 db_name(kEmptyStr), num_seqs(0),
218 gap_open(0), gap_extend(0),scale_factor(0), curr_seq_offset(0),
219 query_options(NULL), lookup_options(NULL), lookup(NULL)
220 { };
~CRPS_DbInfo()221 ~CRPS_DbInfo()
222 {
223 if( NULL != query_options) {
224 BlastQuerySetUpOptionsFree(query_options);
225 }
226
227 if(NULL != lookup) {
228 BlastAaLookupTableDestruct(lookup);
229 }
230
231 if(NULL != lookup_options) {
232 LookupTableOptionsFree(lookup_options);
233 }
234 };
235 };
236
237 enum CheckInputScoremat_RV
238 {
239 sm_valid_has_pssm,
240 sm_valid_freq_only,
241 sm_invalid
242 };
243
244 enum
245 {
246 eUndefined = -1,
247 eFalse,
248 eTrue
249 };
250
251 CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters & pssm_w_parameters,
252 const string & filename);
253 void x_SetupArgDescriptions(void);
254 void x_InitProgramParameters(void);
255 vector<string> x_GetSMPFilenames(void);
256 void x_InitOutputDb(CRPS_DbInfo & rpsDBInfo);
257 void x_InitRPSDbInfo(CRPS_DbInfo & rpsDBInfo, Int4 vol, Int4 num_files);
258 void x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p);
259 void x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only);
260 void x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size);
261 void x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size);
262 void x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_p);
263 void x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size);
264 void x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo);
265 void x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size);
266 bool x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename);
267 bool x_ValidateCd(const list<double>& freqs, const list<double>& observ, unsigned int alphabet_size);
268 void x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
269 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset);
270 vector<string> x_CreateDeltaList(void);
271 void x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size);
272 void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
273 bool x_IsUpdateFreqRatios(const CPssm & p);
274 void x_MakeVol(Int4 vol, vector<string> & smps);
275
276 int x_Run(void);
277
278 void x_AddCmdOptions(void);
279 void x_CreateAliasFile(void);
280
281 // Data
282 CNcbiOstream * m_LogFile;
283 CNcbiIstream * m_InPssmList;
284 string m_Title;
285 double m_WordDefaultScoreThreshold;
286 string m_OutDbName;
287 string m_OutDbType;
288 bool m_CreateIndexFile;
289 int m_GapOpenPenalty;
290 int m_GapExtPenalty;
291 double m_PssmScaleFactor;
292 string m_Matrix;
293 op_mode m_op_mode;
294 bool m_binary_scoremat;
295 int m_MaxSmpFilesPerVol;
296 int m_NumOfVols;
297
298 EBlastDbVersion m_DbVer;
299 CRef<CTaxIdSet> m_Taxids;
300 bool m_Done;
301
302 //For Delta Blast
303 double m_ObsrvThreshold;
304 bool m_ExcludeInvalid;
305
306 int m_UpdateFreqRatios;
307 bool m_UseModelThreshold;
308
309 vector<string> m_VolNames;
310 CBlastUsageReport m_UsageReport;
311 CStopWatch m_StopWatch;
312 };
313
CMakeProfileDBApp(void)314 CMakeProfileDBApp::CMakeProfileDBApp(void)
315 : m_LogFile(NULL), m_InPssmList(NULL), m_Title(kEmptyStr),
316 m_WordDefaultScoreThreshold(0), m_OutDbName(kEmptyStr),
317 m_OutDbType(kEmptyStr), m_CreateIndexFile(false),m_GapOpenPenalty(0),
318 m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(kEmptyStr), m_op_mode(op_invalid),
319 m_binary_scoremat(false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(eBDB_Version5),
320 m_Taxids(new CTaxIdSet()), m_Done(false),
321 m_ObsrvThreshold(0), m_ExcludeInvalid(false),
322 m_UpdateFreqRatios(eUndefined), m_UseModelThreshold(true)
323 {
324 CRef<CVersion> version(new CVersion());
325 version->SetVersionInfo(new CBlastVersion());
326 SetFullVersion(version);
327 m_StopWatch.Start();
328 if (m_UsageReport.IsEnabled()) {
329 m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
330 m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeprofiledb");
331 }
332 }
333
~CMakeProfileDBApp()334 CMakeProfileDBApp::~CMakeProfileDBApp()
335 {
336 // NEED CLEAN UP CODE !!!!
337 if(m_Done == false)
338 {
339 for(unsigned int i =0; i < m_VolNames.size(); i ++)
340 {
341 string rps_str = m_VolNames[i] + ".rps";
342 string lookup_str = m_VolNames[i] + ".loo";
343 string aux_str = m_VolNames[i] + ".aux";
344 string freq_str = m_VolNames[i] + ".freq";
345 CFile(rps_str).Remove();
346 CFile(lookup_str).Remove();
347 CFile(aux_str).Remove();
348 CFile(freq_str).Remove();
349
350 if(op_cobalt == m_op_mode)
351 {
352 string blocks_str = m_VolNames[i] + ".blocks";
353 CFile(blocks_str).Remove();
354 }
355
356 if(op_delta == m_op_mode)
357 {
358 string wcounts_str = m_VolNames[i] + ".wcounts";
359 string obsr_str = m_VolNames[i] + ".obsr";
360 CFile(wcounts_str).Remove();
361 CFile(obsr_str).Remove();
362 }
363 }
364 if (m_VolNames.size() > 1) {
365 string pal_str = m_OutDbName + ".pal";
366 CFile(pal_str).Remove();
367 }
368 }
369 else
370 {
371 for(unsigned int i =0; i < m_VolNames.size(); i ++) {
372 string pog_str = m_VolNames[i] + ".pog";
373 CFile(pog_str).Remove();
374 }
375 }
376 m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
377 }
378
x_SetupArgDescriptions(void)379 void CMakeProfileDBApp::x_SetupArgDescriptions(void)
380 {
381 HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
382
383 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
384
385 // Specify USAGE context
386 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
387 "Application to create databases for rpsblast, cobalt and deltablast, version "
388 + CBlastVersion().Print());
389
390 string dflt("Default = input file name provided to -");
391 dflt += kInPssmList + " argument";
392
393 arg_desc->SetCurrentGroup("Input options");
394 arg_desc->AddKey(kInPssmList, "in_pssm_list",
395 "Input file that contains a list of smp files (delimited by space, tab or newline)",
396 CArgDescriptions::eInputFile);
397
398 arg_desc->AddFlag(kBinaryScoremat,
399 "Scoremats are in binary format",
400 true);
401
402 arg_desc->SetCurrentGroup("Configuration options");
403 arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
404 "Title for database\n" + dflt,
405 CArgDescriptions::eString);
406
407 arg_desc->AddDefaultKey(kArgWordScoreThreshold, "word_score_threshold",
408 "Minimum word score to add a word to the lookup table",
409 CArgDescriptions::eDouble,
410 NStr::DoubleToString(kDefaultWordScoreThreshold));
411 arg_desc->AddFlag(kUseCmdlineThreshold, "Use cmdline threshold", true);
412
413 arg_desc->SetCurrentGroup("Output options");
414 arg_desc->AddOptionalKey(kOutDbName, "database_name",
415 "Name of database to be created\n" +
416 dflt , CArgDescriptions::eString);
417
418 arg_desc->AddDefaultKey("blastdb_version", "version",
419 "Version of BLAST database to be created",
420 CArgDescriptions::eInteger,
421 NStr::NumericToString(static_cast<int>(eBDB_Version5)));
422 arg_desc->SetConstraint("blastdb_version",
423 new CArgAllow_Integers(eBDB_Version4, eBDB_Version5));
424
425 arg_desc->AddDefaultKey(kMaxSmpFilesPerVol, "max_smp_files_per_vol",
426 "Maximum number of SMP files per DB volume",
427 CArgDescriptions::eInteger, NStr::IntToString(kDefaultMaxSmpFilesPerVol));
428
429 arg_desc->AddDefaultKey(kOutDbType, "output_db_type",
430 "Output database type: cobalt, delta, rps",
431 CArgDescriptions::eString, kDefaultOutDbType);
432 arg_desc->SetConstraint(kOutDbType, &(*new CArgAllow_Strings, kOutDbRps, kOutDbCobalt , kOutDbDelta ));
433
434 arg_desc->AddDefaultKey(kOutIndexFile, "create_index_files",
435 "Create Index Files",
436 CArgDescriptions::eBoolean, kDefaultOutIndexFile);
437
438 arg_desc->SetCurrentGroup("Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
439 arg_desc->AddOptionalKey(kArgGapOpen, "gap_open_penalty",
440 "Cost to open a gap",
441 CArgDescriptions::eInteger);
442
443 arg_desc->AddOptionalKey(kArgGapExtend, "gap_extend_penalty",
444 "Cost to extend a gap, ",
445 CArgDescriptions::eInteger);
446
447 arg_desc->AddDefaultKey(kPssmScaleFactor, "pssm_scale_factor",
448 "Pssm Scale factor ",
449 CArgDescriptions::eDouble,
450 NStr::DoubleToString(kDefaultPssmScaleFactor));
451
452 arg_desc->AddDefaultKey(kArgMatrixName, "matrix_name",
453 "Scoring matrix name",
454 CArgDescriptions::eString,
455 kDefaultMatrix);
456 arg_desc->SetConstraint(kArgMatrixName, &(*new CArgAllow_Strings,kMatrixBLOSUM62, kMatrixBLOSUM80,
457 kMatrixBLOSUM50, kMatrixBLOSUM45, kMatrixBLOSUM90, kMatrixPAM250, kMatrixPAM30, kMatrixPAM70));
458
459 //Delta Blast Options
460 arg_desc->SetCurrentGroup("Delta Blast Options");
461 arg_desc->AddDefaultKey(kObsrThreshold, "observations_threshold", "Exclude domains with "
462 "with maximum number of independent observations "
463 "below this threshold", CArgDescriptions::eDouble,
464 NStr::DoubleToString(kDefaultObsrThreshold));
465
466 arg_desc->AddDefaultKey(kExcludeInvalid, "exclude_invalid", "Exclude domains that do "
467 "not pass validation test",
468 CArgDescriptions::eBoolean, kDefaultExcludeInvalid);
469
470 arg_desc->SetCurrentGroup("Taxonomy options");
471 arg_desc->AddOptionalKey("taxid", "TaxID",
472 "Taxonomy ID to assign to all sequences",
473 CArgDescriptions::eInteger);
474 arg_desc->SetConstraint("taxid", new CArgAllowValuesGreaterThanOrEqual(0));
475 arg_desc->SetDependency("taxid", CArgDescriptions::eExcludes, "taxid_map");
476
477 arg_desc->AddOptionalKey("taxid_map", "TaxIDMapFile",
478 "Text file mapping sequence IDs to taxonomy IDs.\n"
479 "Format:<SequenceId> <TaxonomyId><newline>",
480 CArgDescriptions::eInputFile);
481
482 SetupArgDescriptions(arg_desc.release());
483 }
484
x_InitProgramParameters(void)485 void CMakeProfileDBApp::x_InitProgramParameters(void)
486 {
487 const CArgs& args = GetArgs();
488
489 //log_file
490 if (args[kLogFile].HasValue())
491 m_LogFile = &args[kLogFile].AsOutputFile();
492 else
493 m_LogFile = &cout;
494
495
496 //in_list
497 if (args[kInPssmList].HasValue())
498 m_InPssmList = &args[kInPssmList].AsInputFile();
499 else
500 NCBI_THROW(CInputException, eInvalidInput, "Please provide an input file with list of smp files");
501
502 // Binary Scoremat
503 m_binary_scoremat = args[kBinaryScoremat];
504
505 //title
506 if (args[kArgDbTitle].HasValue())
507 m_Title = args[kArgDbTitle].AsString();
508 else
509 m_Title = args[kInPssmList].AsString();
510
511 //threshold
512 m_WordDefaultScoreThreshold = args[kArgWordScoreThreshold].AsDouble();
513
514 //Out
515 if(args[kOutDbName].HasValue())
516 m_OutDbName = args[kOutDbName].AsString();
517 else
518 m_OutDbName = args[kInPssmList].AsString();
519
520 //Number of SMP files per db vol
521 m_MaxSmpFilesPerVol = args[kMaxSmpFilesPerVol].AsInteger();
522
523 //out_db_type
524 m_OutDbType = args[kOutDbType].AsString();
525 if(kOutDbRps == m_OutDbType)
526 m_op_mode = op_rps;
527 else if (kOutDbCobalt == m_OutDbType)
528 m_op_mode = op_cobalt;
529 else if(kOutDbDelta == m_OutDbType)
530 m_op_mode = op_delta;
531 else
532 NCBI_THROW(CInputException, eInvalidInput, "Invalid Output database type");
533
534 m_CreateIndexFile = args[kOutIndexFile].AsBoolean();
535
536 int default_gap_open = 0;
537 int default_gap_extend = 0;
538 //matrix
539 m_Matrix = args[kArgMatrixName].AsString();
540 BLAST_GetProteinGapExistenceExtendParams(m_Matrix.c_str(), &default_gap_open, &default_gap_extend);
541
542 //gapopen
543 if(args[kArgGapOpen].HasValue())
544 m_GapOpenPenalty = args[kArgGapOpen].AsInteger();
545 else
546 m_GapOpenPenalty = default_gap_open;
547
548 //gapextend
549 if(args[kArgGapExtend].HasValue())
550 m_GapExtPenalty = args[kArgGapExtend].AsInteger();
551 else
552 m_GapExtPenalty = default_gap_extend;
553
554 //pssm scale factor
555 m_PssmScaleFactor = args[kPssmScaleFactor].AsDouble();
556
557 //matrix
558 m_Matrix = args[kArgMatrixName].AsString();
559
560 //Delta Blast Parameters
561 m_ObsrvThreshold = args[kObsrThreshold].AsDouble();
562 m_ExcludeInvalid = args[kExcludeInvalid].AsBoolean();
563
564 if (args[kUseCmdlineThreshold]){
565 m_UseModelThreshold = false;
566 }
567 m_DbVer = static_cast<EBlastDbVersion>(args["blastdb_version"].AsInteger());
568
569 if (args["taxid"].HasValue()) {
570 _ASSERT( !args["taxid_map"].HasValue() );
571 m_Taxids.Reset(new CTaxIdSet(TAX_ID_FROM(int, args["taxid"].AsInteger())));
572 } else if (args["taxid_map"].HasValue()) {
573 _ASSERT( !args["taxid"].HasValue() );
574 _ASSERT( !m_Taxids.Empty() );
575 m_Taxids->SetMappingFromFile(args["taxid_map"].AsInputFile());
576 }
577 }
578
x_GetSMPFilenames(void)579 vector<string> CMakeProfileDBApp::x_GetSMPFilenames(void)
580 {
581 vector<string> filenames;
582
583 while(!m_InPssmList->eof())
584 {
585 string line;
586 vector<string> tmp;
587 NcbiGetlineEOL(*m_InPssmList, line);
588 NStr::Split(line, " \t\r", tmp, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
589
590 if(tmp.size() > 0)
591 filenames.insert(filenames.end(), tmp.begin(), tmp.end() );
592 }
593
594 if( 0 == filenames.size())
595 NCBI_THROW(CInputException, eInvalidInput, "Input file contains no smp filnames");
596
597 return filenames;
598 }
599
600 CMakeProfileDBApp::CheckInputScoremat_RV
x_CheckInputScoremat(const CPssmWithParameters & pssm_w_parameters,const string & filename)601 CMakeProfileDBApp::x_CheckInputScoremat(const CPssmWithParameters & pssm_w_parameters,
602 const string & filename)
603 {
604 CheckInputScoremat_RV sm = sm_invalid;
605
606 if(pssm_w_parameters.IsSetPssm())
607 {
608 const CPssm & pssm = pssm_w_parameters.GetPssm();
609
610 if(!pssm.IsSetQuery() || (0 == pssm.GetQueryLength()))
611 {
612 string err = filename + " contians no bioseq data";
613 NCBI_THROW(CInputException, eInvalidInput, err);
614 }
615
616 if(!pssm.IsSetNumRows() || !pssm.IsSetNumColumns())
617 {
618 string err = filename + " contians no info on num of columns or num of rows";
619 NCBI_THROW(CInputException, eInvalidInput, err);
620 }
621
622 if((int) (pssm.GetQueryLength()) != pssm.GetNumColumns())
623 {
624 string err = filename + " 's num of columns does not match size of sequence";
625 NCBI_THROW(CInputException, eInvalidInput, err);
626 }
627
628 int num_rows = pssm.GetNumRows();
629 if( num_rows <= 0 || num_rows > BLASTAA_SIZE )
630 {
631 string err = filename + " has invalid alphabet size";
632 NCBI_THROW(CInputException, eInvalidInput, err);
633 }
634
635 // First time around
636 if(eUndefined == m_UpdateFreqRatios)
637 {
638 m_UpdateFreqRatios = x_IsUpdateFreqRatios(pssm);
639 }
640
641 if(m_UpdateFreqRatios && (!pssm.IsSetIntermediateData()|| !pssm.GetIntermediateData().IsSetFreqRatios()))
642 {
643 string err = filename + " contains no frequence ratios for building database";
644 NCBI_THROW(CInputException, eInvalidInput, err);
645 }
646
647 if(op_cobalt == m_op_mode)
648 {
649 if(!pssm_w_parameters.IsSetParams() || !pssm_w_parameters.GetParams().IsSetConstraints() ||
650 ! pssm_w_parameters.GetParams().GetConstraints().IsSetBlocks())
651 {
652 string err = filename + " contains no core block to build cobalt database";
653 NCBI_THROW(CInputException, eInvalidInput, err);
654 }
655 }
656
657 if(pssm.IsSetFinalData())
658 {
659 sm = sm_valid_has_pssm;
660 }
661 else if(pssm.IsSetIntermediateData())
662 {
663 if(pssm.GetIntermediateData().IsSetFreqRatios())
664 {
665 sm = sm_valid_freq_only;
666 }
667 }
668
669 if(sm_invalid == sm)
670 {
671 string err = filename + " contians no pssm or residue frequencies";
672 NCBI_THROW(CInputException, eInvalidInput, err);
673 }
674 }
675 else
676 {
677 string err = filename + " contians no scoremat";
678 NCBI_THROW(CInputException, eInvalidInput, err);
679 }
680
681 return sm;
682 }
683
x_IsUpdateFreqRatios(const CPssm & p)684 bool CMakeProfileDBApp::x_IsUpdateFreqRatios(const CPssm & p)
685 {
686 if(op_cobalt == m_op_mode)
687 return eTrue;
688
689 if(!p.IsSetIntermediateData()|| !p.GetIntermediateData().IsSetFreqRatios())
690 return eFalse;
691
692 return eTrue;
693 }
694
x_InitOutputDb(CRPS_DbInfo & rpsDbInfo)695 void CMakeProfileDBApp::x_InitOutputDb(CRPS_DbInfo & rpsDbInfo)
696 {
697 CWriteDB::EIndexType index_type = (m_CreateIndexFile == true ? CWriteDB::eDefault : CWriteDB::eNoIndex);
698 rpsDbInfo.output_db.Reset(new CWriteDB(rpsDbInfo.db_name, CWriteDB::eProtein, m_Title, index_type, m_CreateIndexFile, false, false, m_DbVer));
699 rpsDbInfo.output_db->SetMaxFileSize(4000000000);
700 return;
701 }
702
s_DeleteMakeprofileDb(const string & name)703 static bool s_DeleteMakeprofileDb(const string & name )
704 {
705 bool isRemoved = false;
706 static const char * mp_ext[]={".rps", ".loo", ".aux", ".freq", ".blocks", ".wcounts", ".obsr", NULL};
707 for(const char ** mp=mp_ext; *mp != NULL; mp++) {
708 CNcbiOstrstream oss;
709 oss << name << *mp;
710 const string fname = CNcbiOstrstreamToString(oss);
711 if (CFile(fname).Remove()) {
712 LOG_POST(Info << "Deleted " << fname);
713 }
714 else {
715 unsigned int index = 0;
716 string vfname = name + "." + NStr::IntToString(index/10) +
717 NStr::IntToString(index%10) + *mp;
718 while (CFile(vfname).Remove()) {
719 index++;
720 vfname = name + "." + NStr::IntToString(index/10) +
721 NStr::IntToString(index%10) + *mp;
722 }
723 }
724 }
725 if(DeleteBlastDb(name, CSeqDB::eProtein))
726 isRemoved = true;
727
728 return isRemoved;
729 }
730
731
x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo,Int4 vol,Int4 num_files)732 void CMakeProfileDBApp::x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo, Int4 vol, Int4 num_files)
733 {
734
735 rpsDbInfo.num_seqs = num_files;
736 if(vol == kSingleVol) {
737 rpsDbInfo.db_name = m_OutDbName;
738 }
739 else if (vol >= 0) {
740 rpsDbInfo.db_name = CWriteDB_File::MakeShortName(m_OutDbName, vol);
741 }
742 else {
743 NCBI_THROW(CBlastException, eCoreBlastError,"Invalid vol number");
744 }
745
746 string rps_str = rpsDbInfo.db_name + ".rps";
747 rpsDbInfo.pssm_file.open(rps_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
748 if (!rpsDbInfo.pssm_file.is_open())
749 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .rps file ");
750
751 string lookup_str = rpsDbInfo.db_name + ".loo";
752 rpsDbInfo.lookup_file.open(lookup_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
753 if (!rpsDbInfo.lookup_file.is_open())
754 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .loo file");
755
756 string aux_str = rpsDbInfo.db_name + ".aux";
757 rpsDbInfo.aux_file.open(aux_str.c_str());
758 if (!rpsDbInfo.aux_file.is_open())
759 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .aux file");
760
761 string freq_str = rpsDbInfo.db_name + ".freq";
762 rpsDbInfo.freq_file.open(freq_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
763 if (!rpsDbInfo.freq_file.is_open())
764 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .freq file");
765
766 /* Write the magic numbers to the PSSM file */
767
768 Int4 version = RPS_DATABASE_VERSION;
769 rpsDbInfo.pssm_file.write ((char *)&version , sizeof(Int4));
770 rpsDbInfo.freq_file.write ((char *)&version , sizeof(Int4));
771
772 /* Fill in space for the sequence offsets. The PSSM
773 data gets written after this list of integers. Also
774 write the number of sequences to the PSSM file */
775
776 rpsDbInfo.pssm_file.write((char *) &num_files, sizeof(Int4));
777 rpsDbInfo.freq_file.write((char *) &num_files, sizeof(Int4));
778 for (Int4 i = 0; i <= num_files; i++)
779 {
780 rpsDbInfo.pssm_file.write((char *)&i, sizeof(Int4));
781 rpsDbInfo.freq_file.write((char *)&i, sizeof(Int4));
782 }
783
784 if(op_cobalt == m_op_mode)
785 {
786 string blocks_str = rpsDbInfo.db_name + ".blocks";
787 rpsDbInfo.blocks_file.open(blocks_str.c_str());
788 if (!rpsDbInfo.blocks_file.is_open())
789 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .blocks file");
790 }
791
792
793 rpsDbInfo.curr_seq_offset = 0;
794 //Init them to input arg values first , may change after reading in the first sequence
795 rpsDbInfo.gap_extend = m_GapExtPenalty;
796 rpsDbInfo.gap_open = m_GapOpenPenalty;
797 rpsDbInfo.matrix = m_Matrix;
798 rpsDbInfo.scale_factor = (Int4) ceil(m_PssmScaleFactor);
799
800 return;
801 }
802
803 //For first sequence only
x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo,const CPssmWithParameters & pssm_p)804 void CMakeProfileDBApp::x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p)
805 {
806 if(pssm_p.IsSetParams())
807 {
808 if(pssm_p.GetParams().IsSetRpsdbparams())
809 {
810 const CFormatRpsDbParameters & rps_db_params = pssm_p.GetParams().GetRpsdbparams();
811 if(rps_db_params.IsSetGapExtend())
812 rpsDbInfo.gap_extend = rps_db_params.GetGapExtend();
813
814 if(rps_db_params.IsSetGapOpen())
815 rpsDbInfo.gap_open = rps_db_params.GetGapOpen();
816
817 if(rps_db_params.IsSetMatrixName())
818 rpsDbInfo.matrix = rps_db_params.GetMatrixName();
819 }
820 }
821 return;
822 }
823
x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo,CPssmWithParameters & pssm_p)824 void CMakeProfileDBApp::x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters &pssm_p )
825 {
826 if(!pssm_p.IsSetParams())
827 pssm_p.SetParams();
828
829 if(!pssm_p.GetParams().IsSetRpsdbparams())
830 pssm_p.SetParams().SetRpsdbparams();
831
832 CFormatRpsDbParameters & rps_params= pssm_p.SetParams().SetRpsdbparams();
833 if(!rps_params.IsSetGapExtend())
834 rps_params.SetGapExtend(rpsDbInfo.gap_extend);
835 else if(rps_params.GetGapExtend() != rpsDbInfo.gap_extend)
836 NCBI_THROW(CBlastException, eCoreBlastError, "Gap extend penalties do not match");
837
838 if(!rps_params.IsSetGapOpen())
839 rps_params.SetGapOpen(rpsDbInfo.gap_open);
840 else if(rps_params.GetGapOpen() != rpsDbInfo.gap_open)
841 NCBI_THROW(CBlastException, eCoreBlastError, "Gap open penalties do not match");
842
843 if(!rps_params.IsSetMatrixName())
844 rps_params.SetMatrixName (rpsDbInfo.matrix);
845 else if(rps_params.GetMatrixName()!= rpsDbInfo.matrix)
846 NCBI_THROW(CBlastException, eCoreBlastError, "Score matrix does not match");
847
848 return;
849 }
850
851 /* Update the input scoremat with a new PSSM and modified
852 statistics. Scoremat must contain only residue frequencies.
853 Note that upon completion the new PSSM will always have
854 columns of length BLASTAA_SIZE
855 seq is the sequence and set of score frequencies read in
856 from the next data file
857 seq_size is the number of letters in this sequence
858 alphabet_size refers to the number of PSSM rows
859 ScalingFactor is the multiplier for all PSSM scores
860 */
x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo,CPssmWithParameters & seq,Int4 seq_size)861 void CMakeProfileDBApp::x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size )
862 {
863
864 CPssm & pssm = seq.SetPssm();
865 const CPssmParameters & params = seq.GetParams();
866 string matrix_name = params.GetRpsdbparams().GetMatrixName();
867
868 /* Read in the sequence residues from the scoremat structure. */
869 CNCBIstdaa query_stdaa;
870 pssm.GetQuerySequenceData(query_stdaa);
871
872 vector <char> query_v = query_stdaa.Get();
873
874 if((Int4) (query_v.size()) != seq_size)
875 NCBI_THROW(CBlastException, eCoreBlastError, "Query sequence lengths mismatch");
876
877 /* allocate query array and PSSM row array */
878 AutoArray<Uint1> query(seq_size);
879
880 for(unsigned int i = 0; i < query_v.size(); i++)
881 query[i] = query_v[i];
882
883 auto_ptr<CNcbiMatrix <double> > freq_list (CScorematPssmConverter::GetFreqRatios(seq));
884
885 CPsiBlastInputFreqRatios pssm_freq_ratio(query.get(), seq_size, *freq_list,
886 matrix_name.c_str(), rpsDbInfo.gap_open,
887 rpsDbInfo.gap_extend, rpsDbInfo.scale_factor);
888 CPssmEngine pssm_engine(&pssm_freq_ratio);
889 CRef<CPssmWithParameters> out_par(pssm_engine.Run());
890
891 CPssmFinalData & i = pssm.SetFinalData();
892 const CPssmFinalData & o = out_par->GetPssm().GetFinalData();
893 i.SetScores() = o.GetScores();
894 i.SetLambda() = o.GetLambda();
895 i.SetKappa() = o.GetKappa();
896 i.SetH() = o.GetH();
897 i.SetScalingFactor(rpsDbInfo.scale_factor);
898
899 return;
900 }
901
902 /* The first sequence in the list determines several
903 parameters that all other sequences in the list must
904 have. In this case, extra initialization is required
905
906 info contains all the information on data files
907 and parameters from previously added sequences
908 seq is the sequence and PSSM read in from the next data file
909 seq_index refers to the (0-based) position of this sequence
910 in the complete list of seqences
911 seq_size is the number of letters in this sequence
912 alphabet_size refers to the number of PSSM rows
913 */
x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo,CPssmWithParameters & pssm_w_parameters,bool freq_only)914 void CMakeProfileDBApp::x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only )
915 {
916 x_UpdateRPSDbInfo(rpsDbInfo, pssm_w_parameters);
917
918 x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
919 double wordScoreThreshold = m_WordDefaultScoreThreshold;
920
921 if(!freq_only)
922 {
923 if(pssm_w_parameters.GetPssm().GetFinalData().IsSetScalingFactor())
924 {
925 rpsDbInfo.scale_factor = pssm_w_parameters.GetPssm().GetFinalData().GetScalingFactor();
926 }
927 else
928 {
929 // asn1 default value is 1
930 rpsDbInfo.scale_factor = 1.0;
931 }
932 if(m_UseModelThreshold && pssm_w_parameters.GetPssm().GetFinalData().IsSetWordScoreThreshold())
933 {
934 wordScoreThreshold = pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
935 }
936 }
937 else
938 {
939 x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, pssm_w_parameters.GetPssm().GetQueryLength());
940 }
941
942 /* scale up the threshold value and convert to integer */
943 double threshold = rpsDbInfo.scale_factor * wordScoreThreshold;
944
945 /* create BLAST lookup table */
946 if (LookupTableOptionsNew(eBlastTypeBlastp, &(rpsDbInfo.lookup_options)) != 0)
947 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot create lookup options");
948
949 if (BLAST_FillLookupTableOptions(rpsDbInfo.lookup_options, eBlastTypePsiBlast,
950 FALSE, /* no megablast */
951 threshold, /* neighboring threshold */
952 BLAST_WORDSIZE_PROT ) != 0)
953 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot set lookup table options");
954
955 if (BlastAaLookupTableNew(rpsDbInfo.lookup_options, &(rpsDbInfo.lookup)) != 0)
956 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot allocate lookup table");
957
958 rpsDbInfo.lookup->use_pssm = TRUE; /* manually turn on use of PSSMs */
959
960 /* Perform generic query setup */
961
962 if (BlastQuerySetUpOptionsNew(&(rpsDbInfo.query_options)) != 0)
963 NCBI_THROW(CBlastException, eCoreBlastError, "Generic query setup failed");
964
965 if (BLAST_FillQuerySetUpOptions(rpsDbInfo.query_options, eBlastTypeBlastp,
966 NULL, /* no filtering */
967 0 /* strand not applicable */ ) != 0)
968 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot fill query options");
969
970 /* Write the header of the RPS .aux file */
971 rpsDbInfo.aux_file << rpsDbInfo.matrix << "\n";
972 rpsDbInfo.aux_file << rpsDbInfo.gap_open << "\n";
973 rpsDbInfo.aux_file << rpsDbInfo.gap_extend << "\n";
974 rpsDbInfo.aux_file << scientific << 0.0 << "\n";
975 rpsDbInfo.aux_file << scientific << 0.0 << "\n";
976 rpsDbInfo.aux_file << (int) 0 << "\n";
977 rpsDbInfo.aux_file << (int) 0 << "\n";
978 rpsDbInfo.aux_file << fixed << (double) rpsDbInfo.scale_factor << "\n";
979
980 return;
981 }
982
x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo,const CPssmWithParameters & pssm_p,Int4 seq_size)983 void CMakeProfileDBApp::x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size)
984 {
985 const CPssm & pssm = pssm_p.GetPssm();
986 // Update .blocks file
987 const list<CRef<CCoreBlock> > & block_list = pssm_p.GetParams().GetConstraints().GetBlocks();
988
989 list<CRef<CCoreBlock> >::const_iterator itr = block_list.begin();
990
991 int count =0;
992
993 while(itr != block_list.end())
994 {
995 const CCoreBlock & block = (**itr);
996 if(!block.IsSetStart() || !block.IsSetStop())
997 NCBI_THROW(CInputException, eInvalidInput, "No start Or stop found in conserved block");
998
999 string seq_id_str = "id" + NStr::IntToString(count);
1000 if(pssm.IsSetQuery())
1001 {
1002 if(pssm.GetQuery().IsSeq())
1003 {
1004 if(pssm.GetQuery().GetSeq().IsSetDescr())
1005 {
1006 const list<CRef<CSeqdesc> > descr_list= pssm.GetQuery().GetSeq().GetDescr();
1007 if(descr_list.size() > 0)
1008 {
1009 const CRef<CSeqdesc> descr = descr_list.front();
1010 if(descr->IsTitle())
1011 {
1012 string title = descr->GetTitle();
1013 string accession;
1014 string tmp;
1015 if(NStr::SplitInTwo(title, ",", accession, tmp))
1016 seq_id_str = accession;
1017 }
1018 }
1019 }
1020 }
1021 }
1022
1023 rpsDbInfo.blocks_file << seq_id_str << "\t";
1024 rpsDbInfo.blocks_file << count << "\t";
1025 rpsDbInfo.blocks_file << block.GetStart() << "\t";
1026 rpsDbInfo.blocks_file << block.GetStop() << "\n";
1027 count++;
1028 ++itr;
1029 }
1030 return;
1031 }
x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo,const CPssmWithParameters & pssm_p,Int4 seq_index,Int4 seq_size)1032 void CMakeProfileDBApp::x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size)
1033 {
1034 if (!m_UpdateFreqRatios)
1035 return;
1036
1037 const CPssm & pssm = pssm_p.GetPssm();
1038 // Update .freq file
1039 Int4 i = 0;
1040 Int4 j = 0;
1041 Int4 row[BLASTAA_SIZE];
1042 Int4 alphabet_size = pssm.GetNumRows();
1043
1044 const list<double> & freq_ratios = pssm.GetIntermediateData().GetFreqRatios();
1045 list<double>::const_iterator itr_fr = freq_ratios.begin();
1046 rpsDbInfo.freq_file.seekp(0, ios_base::end);
1047
1048 if (pssm.GetByRow() == FALSE) {
1049 for (i = 0; i < seq_size; i++) {
1050 for (j = 0; j < alphabet_size; j++) {
1051 if (itr_fr == freq_ratios.end())
1052 break;
1053 row[j] = (Int4) BLAST_Nint(*itr_fr * FREQ_RATIO_SCALE);
1054 ++itr_fr;
1055 }
1056 for ( ;j < BLASTAA_SIZE; j++) {
1057 row[j] = 0;
1058 }
1059 rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1060 }
1061 }
1062 else {
1063 auto_ptr<CNcbiMatrix<double> > matrix (CScorematPssmConverter::GetFreqRatios(pssm_p));
1064
1065 for (i = 0; i < seq_size; i++) {
1066 for (j = 0; j < BLASTAA_SIZE; j++) {
1067 row[j] = (Int4) BLAST_Nint((*matrix)(i,j ) * FREQ_RATIO_SCALE);
1068 }
1069 rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1070 }
1071 }
1072
1073 memset(row, 0, sizeof(row));
1074 rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1075
1076 rpsDbInfo.freq_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1077 rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1078 return;
1079 }
1080
1081 /* Incrementally update the BLAST lookup table with
1082 words derived from the present sequence
1083 info contains all the information on data files
1084 and parameters from previously added sequences
1085 seq is the sequence and PSSM read in from the next data file
1086 seq_size is the number of letters in this sequence
1087 */
x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo,Int4 seq_size)1088 void CMakeProfileDBApp::x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size)
1089 {
1090 BlastSeqLoc *lookup_segment = NULL;
1091
1092 /* Tell the blast engine to index the entire input
1093 sequence. Since only the PSSM matters for lookup
1094 table creation, the process does not require
1095 actually extracting the sequence data from 'seq'*/
1096
1097 BlastSeqLocNew(&lookup_segment, 0, seq_size - 1);
1098
1099 /* add this sequence to the lookup table. NULL
1100 is passed in place of the query */
1101
1102 Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1103 if (NULL == posMatrix)
1104 NCBI_THROW(CBlastException, eCoreBlastError, "Empty pos matrix");
1105
1106 BlastAaLookupIndexQuery(rpsDbInfo.lookup, posMatrix,
1107 NULL, lookup_segment, rpsDbInfo.curr_seq_offset);
1108
1109 BlastSeqLocFree(lookup_segment);
1110 return;
1111 }
1112
1113 /* Incrementally update the RPS PSSM file with the
1114 PSSM for the next input sequence
1115 info contains all the information on data files
1116 and parameters from previously added sequences
1117 seq is the sequence and PSSM read in from the next data file
1118 seq_index refers to the (0-based) position of this sequence
1119 in the complete list of seqences
1120 seq_size is the number of letters in this sequence
1121 alphabet_size refers to the number of PSSM rows
1122 */
x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo,const CPssm & pssm,Int4 seq_index,Int4 seq_size)1123 void CMakeProfileDBApp::x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size)
1124 {
1125 Int4 i = 0;
1126 Int4 j = 0;
1127
1128 /* Note that RPS blast requires an extra column at
1129 * the end of the PSSM */
1130
1131 list<int>::const_iterator score_list_itr = pssm.GetFinalData().GetScores().begin();
1132 list<int>::const_iterator score_list_end = pssm.GetFinalData().GetScores().end();
1133 Int4 alphabet_size = pssm.GetNumRows();
1134
1135 rpsDbInfo.pos_matrix.Create(seq_size + 1);
1136 Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1137 if (pssm.GetByRow() == FALSE) {
1138 for (i = 0; i < seq_size; i++) {
1139 for (j = 0; j < alphabet_size; j++) {
1140 if (score_list_itr == score_list_end)
1141 break;
1142 posMatrix[i][j] = *score_list_itr;
1143 score_list_itr++;
1144 }
1145 if (j < alphabet_size)
1146 break;
1147 for (; j < BLASTAA_SIZE; j++) {
1148 posMatrix[i][j] = INT2_MIN;
1149 }
1150 }
1151 }
1152 else {
1153 for (j = 0; j < alphabet_size; j++) {
1154 for (i = 0; i < seq_size; i++) {
1155 if (score_list_itr == score_list_end)
1156 break;
1157 posMatrix[i][j] = *score_list_itr;
1158 score_list_itr++;
1159 }
1160 if (i < seq_size)
1161 break;
1162 }
1163 if (j == alphabet_size) {
1164 for (; j < BLASTAA_SIZE; j++) {
1165 for (i = 0; i < seq_size; i++) {
1166 posMatrix[i][j] = INT2_MIN;
1167 }
1168 }
1169 }
1170 }
1171
1172 if (i < seq_size || j < alphabet_size)
1173 NCBI_THROW(CBlastException, eCoreBlastError, "PSSM was truncated early");
1174
1175 if(score_list_itr != score_list_end)
1176 NCBI_THROW(CBlastException, eCoreBlastError, "PSSM too large for this sequence");
1177
1178 /* manually fill in the extra (last) column of the PSSM.
1179 Note that the value to use should more appropriately
1180 be BLAST_SCORE_MIN, but we instead follow the convention
1181 used in copymat */
1182
1183 for (i = 0; i < BLASTAA_SIZE; i++)
1184 posMatrix[seq_size][i] = -BLAST_SCORE_MAX;
1185
1186 /* Dump the score matrix, column by column */
1187 rpsDbInfo.pssm_file.seekp(0, ios_base::end);
1188 for (i = 0; i < seq_size + 1; i++) {
1189 rpsDbInfo.pssm_file.write((const char *) posMatrix[i], sizeof(Int4)*BLASTAA_SIZE);
1190 }
1191 /* Write the next context offset. Note that the
1192 RPSProfileHeader structure is one int too large for
1193 our purposes, so that the index of this sequence
1194 must be decremented to get the right byte offset
1195 into the file */
1196
1197 rpsDbInfo.pssm_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1198 rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1199
1200 return;
1201 }
1202
1203 /* Once all sequences have been processed, perform
1204 final setup on the BLAST lookup table and finish
1205 up the RPS files */
1206
x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo)1207 void CMakeProfileDBApp::x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo)
1208 {
1209 /* Write the last context offset to the PSSM file.
1210 This is the total number of letters for all RPS
1211 DB sequences combined */
1212
1213 rpsDbInfo.pssm_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1214 rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1215 rpsDbInfo.freq_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1216 rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1217
1218 /* Pack the lookup table into its compressed form */
1219 if(NULL == rpsDbInfo.lookup)
1220 NCBI_THROW(CBlastException, eCoreBlastError, "Empty database");
1221
1222 if (BlastAaLookupFinalize(rpsDbInfo.lookup, eBackbone) != 0) {
1223 NCBI_THROW(CBlastException, eCoreBlastError, "Failed to compress lookup table");
1224 }
1225 else {
1226 /* Change the lookup table format to match that
1227 of the legacy BLAST lookup table */
1228
1229 BlastRPSLookupFileHeader header;
1230 BlastAaLookupTable *lut = rpsDbInfo.lookup;
1231 Int4 i, index;
1232 Int4 cursor, old_cursor;
1233 AaLookupBackboneCell *cell;
1234 RPSBackboneCell empty_cell;
1235
1236 memset(&header, 0, sizeof(header));
1237 header.magic_number = RPS_DATABASE_VERSION;
1238
1239 /* for each lookup table cell */
1240
1241 for (index = cursor = 0; index < lut->backbone_size; index++) {
1242 cell = (AaLookupBackboneCell*)lut->thick_backbone + index;
1243
1244
1245 if (cell->num_used == 0)
1246 continue;
1247
1248 /* The cell contains hits */
1249
1250 if (cell->num_used <= RPS_HITS_PER_CELL) {
1251 /* if 3 hits or less, just update each hit offset
1252 to point to the end of the word rather than
1253 the beginning */
1254
1255 for (i = 0; i < cell->num_used; i++)
1256 cell->payload.entries[i] += BLAST_WORDSIZE_PROT - 1;
1257 }
1258 else {
1259 /* if more than 3 hits, pack the first hit into the
1260 lookup table cell, pack the overflow array byte
1261 offset into the cell, and compress the resulting
1262 'hole' in the overflow array. Update the hit
1263 offsets as well */
1264
1265 old_cursor = cell->payload.overflow_cursor;
1266 cell->payload.entries[0] = ((Int4*)lut->overflow)[old_cursor] +
1267 BLAST_WORDSIZE_PROT - 1;
1268 cell->payload.entries[1] = cursor * sizeof(Int4);
1269 for (i = 1; i < cell->num_used; i++, cursor++) {
1270 ((Int4*)lut->overflow)[cursor]
1271 = ((Int4*)lut->overflow)[old_cursor + i] +
1272 BLAST_WORDSIZE_PROT - 1;
1273 }
1274 }
1275 }
1276
1277 header.start_of_backbone = sizeof(header);
1278 header.end_of_overflow = header.start_of_backbone +
1279 (RPS_NUM_LOOKUP_CELLS + 1) * sizeof(RPSBackboneCell) +
1280 cursor * sizeof(Int4);
1281
1282 /* write the lookup file header */
1283
1284 rpsDbInfo.lookup_file.write((const char *)&header, sizeof(header));
1285
1286 /* write the thick backbone */
1287
1288 rpsDbInfo.lookup_file.write((const char *)lut->thick_backbone,
1289 sizeof(RPSBackboneCell)* lut->backbone_size);
1290
1291 /* write extra backbone cells */
1292 memset(&empty_cell, 0, sizeof(empty_cell));
1293 for (i = lut->backbone_size; i < RPS_NUM_LOOKUP_CELLS + 1; i++) {
1294 rpsDbInfo.lookup_file.write((const char *)&empty_cell, sizeof(empty_cell));
1295 }
1296
1297 /* write the new overflow array */
1298 rpsDbInfo.lookup_file.write((const char *)lut->overflow, sizeof(Int4)*cursor);
1299 }
1300
1301 /* Free data, close files */
1302
1303 rpsDbInfo.lookup = BlastAaLookupTableDestruct(rpsDbInfo.lookup);
1304 rpsDbInfo.query_options = BlastQuerySetUpOptionsFree(rpsDbInfo.query_options);
1305 rpsDbInfo.lookup_file.flush();
1306 rpsDbInfo.lookup_file.close();
1307 rpsDbInfo.pssm_file.flush();
1308 rpsDbInfo.pssm_file.close();
1309 rpsDbInfo.aux_file.flush();
1310 rpsDbInfo.aux_file.close();
1311 rpsDbInfo.freq_file.flush();
1312 rpsDbInfo.freq_file.close();
1313
1314 if(op_cobalt == m_op_mode)
1315 {
1316 rpsDbInfo.blocks_file.flush();
1317 rpsDbInfo.blocks_file.close();
1318 }
1319 else if(!m_UpdateFreqRatios)
1320 {
1321 string freq_str = m_OutDbName + ".freq";
1322 CFile(freq_str).Remove();
1323 }
1324
1325 }
1326
Init(void)1327 void CMakeProfileDBApp::Init(void)
1328 {
1329 x_SetupArgDescriptions();
1330 x_InitProgramParameters();
1331 }
1332
s_HasDefline(const CBioseq & bio)1333 static bool s_HasDefline(const CBioseq & bio)
1334 {
1335 if (bio.CanGetDescr()) {
1336 return true;
1337 }
1338
1339 return false;
1340 }
1341
s_GenerateBlastDefline(const CBioseq & bio)1342 static CRef<CBlast_def_line_set> s_GenerateBlastDefline(const CBioseq & bio)
1343 {
1344 CRef<CBlast_def_line_set> defline_set(new CBlast_def_line_set());
1345 CRef<CBlast_def_line> defline(new CBlast_def_line());
1346 defline->SetSeqid() = bio.GetId();
1347 defline_set->Set().push_back(defline);
1348 return defline_set;
1349 }
1350
x_Run(void)1351 int CMakeProfileDBApp::x_Run(void)
1352 {
1353 CBuildDatabase::CreateDirectories(m_OutDbName);
1354 if ( s_DeleteMakeprofileDb(m_OutDbName)) {
1355 *m_LogFile << "Deleted existing BLAST database with identical name." << endl;
1356 }
1357 vector<string> smpFilenames = (op_delta == m_op_mode )? x_CreateDeltaList():x_GetSMPFilenames();
1358 int num_smps = smpFilenames.size();
1359 m_NumOfVols = num_smps/m_MaxSmpFilesPerVol + 1;
1360 int num_seqs = num_smps/m_NumOfVols;
1361 int residue_seqs = num_smps % m_NumOfVols;
1362 if(m_NumOfVols == 1) {
1363 x_MakeVol( -1, smpFilenames);
1364 m_Done = true;
1365 return 0;
1366 }
1367 else {
1368 vector<string>::iterator b = smpFilenames.begin();
1369 vector<string>::iterator r = b + num_seqs;
1370 for(int i=0; i < m_NumOfVols; i++) {
1371 vector<string> vol_smps(b, r);
1372 x_MakeVol(i, vol_smps);
1373 b= r;
1374 r = b + num_seqs;
1375 if(residue_seqs > 0) {
1376 r++;
1377 residue_seqs--;
1378 }
1379 }
1380 _ASSERT(b==smpFilenames.end());
1381 }
1382 if (m_NumOfVols == m_VolNames.size()) {
1383 x_CreateAliasFile();
1384 m_Done = true;
1385 }
1386 return 0;
1387 }
1388
x_MakeVol(Int4 vol,vector<string> & smps)1389 void CMakeProfileDBApp::x_MakeVol(Int4 vol, vector<string> & smps)
1390 {
1391
1392 CRPS_DbInfo rpsDbInfo;
1393 x_InitRPSDbInfo(rpsDbInfo, vol, smps.size());
1394 m_VolNames.push_back(rpsDbInfo.db_name);
1395 x_InitOutputDb(rpsDbInfo);
1396
1397 for(int seq_index=0; seq_index < rpsDbInfo.num_seqs; seq_index++)
1398 {
1399 string filename = smps[seq_index];
1400 CFile f(filename);
1401 if(!f.Exists())
1402 {
1403 string err = filename + " does not exists";
1404 NCBI_THROW(CInputException, eInvalidInput, err);
1405 }
1406
1407 //Read PssmWithParameters from file
1408 CPssmWithParameters pssm_w_parameters;
1409 if(m_binary_scoremat)
1410 {
1411 CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1412 in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1413 }
1414 else
1415 {
1416 CNcbiIfstream in_stream(filename.c_str());
1417 in_stream >> MSerial_AsnText >> pssm_w_parameters;
1418 }
1419
1420 CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1421 // Should have error out already....
1422 if(sm_invalid == sm)
1423 {
1424 string err = filename + " contains invalid scoremat";
1425 NCBI_THROW(CInputException, eInvalidInput, err);
1426 }
1427
1428 const CPssm & pssm = pssm_w_parameters.GetPssm();
1429 int seq_size = pssm.GetQueryLength();
1430
1431 const CBioseq & bioseq = pssm.GetQuery().GetSeq();
1432 CRef<CBlast_def_line_set> deflines;
1433 if(s_HasDefline(bioseq)) {
1434 deflines = CWriteDB::ExtractBioseqDeflines(bioseq);
1435 }
1436 else {
1437 deflines = s_GenerateBlastDefline(bioseq);
1438 }
1439
1440 m_Taxids->FixTaxId(deflines);
1441 rpsDbInfo.output_db->AddSequence(bioseq);
1442 rpsDbInfo.output_db->SetDeflines(*deflines);
1443
1444 //Complete RpsDnInfo init with data from first file
1445 if(NULL == rpsDbInfo.lookup)
1446 {
1447 x_RPSAddFirstSequence( rpsDbInfo, pssm_w_parameters, sm == sm_valid_freq_only);
1448 }
1449 else
1450 {
1451 x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
1452 if(sm_valid_freq_only == sm){
1453 x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, seq_size);
1454 }
1455
1456 if( pssm.GetFinalData().IsSetScalingFactor())
1457 {
1458 if( pssm.GetFinalData().GetScalingFactor() != rpsDbInfo.scale_factor) {
1459 NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1460 }
1461 }
1462 else
1463 {
1464 // If scaling factor not specified, the default is 1
1465 if( 1 != rpsDbInfo.scale_factor) {
1466 NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1467 }
1468 }
1469
1470 if(m_UseModelThreshold && pssm.GetFinalData().IsSetWordScoreThreshold()) {
1471 rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
1472 }
1473 else {
1474 rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * m_WordDefaultScoreThreshold;
1475 }
1476
1477 }
1478
1479 x_RPSUpdatePSSM(rpsDbInfo, pssm, seq_index, seq_size);
1480 x_RPSUpdateLookup(rpsDbInfo, seq_size);
1481 x_UpdateFreqRatios(rpsDbInfo, pssm_w_parameters, seq_index, seq_size);
1482
1483 rpsDbInfo.aux_file << seq_size << "\n";
1484 rpsDbInfo.aux_file << scientific << pssm.GetFinalData().GetKappa() << "\n";
1485 rpsDbInfo.curr_seq_offset +=(seq_size +1);
1486 rpsDbInfo.pos_matrix.Delete();
1487
1488 if(op_cobalt == m_op_mode) {
1489 x_UpdateCobalt(rpsDbInfo, pssm_w_parameters, seq_size);
1490 }
1491 }
1492
1493 if(op_delta == m_op_mode) {
1494 x_UpdateDelta(rpsDbInfo, smps);
1495 }
1496 rpsDbInfo.output_db->Close();
1497 x_RPS_DbClose(rpsDbInfo);
1498 }
1499
s_WriteInt4List(CNcbiOfstream & ostr,const list<Int4> & l)1500 static void s_WriteInt4List(CNcbiOfstream & ostr, const list<Int4> & l)
1501 {
1502 ITERATE(list<Int4>, it, l)
1503 {
1504 ostr.write((char*)&(*it), sizeof(Int4));
1505 }
1506 }
1507
s_WriteUint4List(CNcbiOfstream & ostr,const list<Uint4> & l)1508 static void s_WriteUint4List(CNcbiOfstream & ostr, const list<Uint4> & l)
1509 {
1510 ITERATE(list<Uint4>, it, l)
1511 {
1512 ostr.write((char*)&(*it), sizeof(Uint4));
1513 }
1514 }
1515
x_CreateDeltaList(void)1516 vector<string> CMakeProfileDBApp::x_CreateDeltaList(void)
1517 {
1518 vector<string> smpFilenames = x_GetSMPFilenames();
1519 vector<string> deltaList;
1520
1521 for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1522 {
1523 string filename = smpFilenames[seq_index];
1524 CFile f(filename);
1525 if(!f.Exists())
1526 {
1527 string err = filename + " does not exists";
1528 NCBI_THROW(CInputException, eInvalidInput, err);
1529 }
1530
1531 //Read PssmWithParameters from file
1532 CPssmWithParameters pssm_w_parameters;
1533 if(m_binary_scoremat)
1534 {
1535 CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1536 in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1537 }
1538 else
1539 {
1540 CNcbiIfstream in_stream(filename.c_str());
1541 in_stream >> MSerial_AsnText >> pssm_w_parameters;
1542 }
1543
1544 CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1545 // Should have error out already....
1546 if(sm_invalid == sm)
1547 {
1548 string err = filename + " contains invalid scoremat";
1549 NCBI_THROW(CInputException, eInvalidInput, err);
1550 }
1551
1552 const CPssm & pssm = pssm_w_parameters.GetPssm();
1553 int seq_size = pssm.GetQueryLength();
1554 if(!pssm.IsSetIntermediateData()|| !pssm.GetIntermediateData().IsSetWeightedResFreqsPerPos())
1555 {
1556 string err = filename + " contains no weighted residue frequencies for building delta database";
1557 NCBI_THROW(CInputException, eInvalidInput, err);
1558 }
1559
1560 if(!pssm.GetIntermediateData().IsSetNumIndeptObsr())
1561 {
1562 string err = filename + " contains no observations information for building delta database";
1563 NCBI_THROW(CInputException, eInvalidInput, err);
1564 }
1565
1566 if (true == x_CheckDelta(pssm, seq_size, filename))
1567 {
1568 deltaList.push_back(filename);
1569 }
1570 }
1571
1572 return deltaList;
1573 }
1574
x_UpdateDelta(CRPS_DbInfo & rpsDbInfo,vector<string> & smpFilenames)1575 void CMakeProfileDBApp::x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames)
1576 {
1577 CTmpFile tmp_obsr_file(CTmpFile::eRemove);
1578 CTmpFile tmp_freq_file(CTmpFile::eRemove);
1579 CNcbiOfstream tmp_obsr_buff(tmp_obsr_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1580 CNcbiOfstream tmp_freq_buff(tmp_freq_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1581
1582 list<Int4> FreqOffsets;
1583 list<Int4> ObsrOffsets;
1584 Int4 CurrFreqOffset = 0;
1585 Int4 CurrObsrOffset= 0;
1586
1587 for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1588 {
1589 string filename = smpFilenames[seq_index];
1590 //Read PssmWithParameters from file
1591 CPssmWithParameters pssm_w_parameters;
1592 if(m_binary_scoremat)
1593 {
1594 CNcbiIfstream in_stream(filename.c_str(), ios::binary);
1595 in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1596 }
1597 else
1598 {
1599 CNcbiIfstream in_stream(filename.c_str());
1600 in_stream >> MSerial_AsnText >> pssm_w_parameters;
1601 }
1602
1603 const CPssm & pssm = pssm_w_parameters.GetPssm();
1604 int seq_size = pssm.GetQueryLength();
1605
1606 // get weightd residue frequencies
1607 const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1608
1609 // get number of independent observations
1610 const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1611
1612 int alphabet_size = pssm.GetNumRows();
1613 list<double> modify_freqs;
1614
1615 if(pssm.GetByRow())
1616 {
1617 // need to flip the freq matrix
1618 vector<double> tmp(orig_freqs.size());
1619 list<double>::const_iterator f_itr = orig_freqs.begin();
1620
1621 for(int i = 0; i < alphabet_size; i++)
1622 {
1623 for(int j = 0; j < seq_size; j++)
1624 {
1625 tmp[i + j*alphabet_size] = *f_itr;
1626 ++f_itr;
1627 }
1628 }
1629 copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1630 }
1631
1632 // Pad matrix if necessary
1633 if(alphabet_size < BLASTAA_SIZE)
1634 {
1635 if(0 == modify_freqs.size())
1636 copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1637
1638 list<double>::iterator p_itr = modify_freqs.begin();
1639
1640 for (int j=0; j < seq_size; j++)
1641 {
1642 for(int i=0; i < alphabet_size; i++)
1643 {
1644 if(modify_freqs.end() == p_itr)
1645 break;
1646
1647 ++p_itr;
1648 }
1649
1650 modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1651 }
1652 }
1653
1654 const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1655
1656 //save offset for this record
1657 ObsrOffsets.push_back(CurrObsrOffset);
1658
1659 list<Uint4> ObsrBuff;
1660 // write effective observations in compressed form
1661 // as a list of pairs: value, number of occurences
1662 unsigned int num_obsr_columns = 0;
1663 list<double>::const_iterator obsr_it = obsr.begin();
1664 do
1665 {
1666 double current = *obsr_it;
1667 Uint4 num = 1;
1668 num_obsr_columns++;
1669 obsr_it++;
1670 while (obsr_it != obsr.end() && fabs(*obsr_it - current) < 1e-4)
1671 {
1672 obsr_it++;
1673 num++;
1674 num_obsr_columns++;
1675 }
1676
1677 // +1 because pssm engine returns alpha (in psi-blast papers)
1678 // which is number of independent observations - 1
1679 ObsrBuff.push_back((Uint4)((current + 1.0) * kFixedPointScaleFactor));
1680 ObsrBuff.push_back(num);
1681 }
1682 while (obsr_it != obsr.end());
1683
1684 Uint4 num_weighted_counts = 0;
1685
1686 // save offset for this frequencies record
1687 FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1688
1689 list<Uint4> FreqBuff;
1690 // save weighted residue frequencies
1691 ITERATE (list<double>, it, freqs)
1692 {
1693 FreqBuff.push_back((Uint4)(*it * kFixedPointScaleFactor));
1694 num_weighted_counts++;
1695 }
1696
1697 if (num_obsr_columns != num_weighted_counts / BLASTAA_SIZE)
1698 {
1699 string err = "Number of frequencies and observations columns do not match in " + filename;
1700 NCBI_THROW(CException, eInvalid, err);
1701 }
1702
1703 // additional column of zeros is added for compatibility with rps database
1704 unsigned int padded_size = FreqBuff.size() + BLASTAA_SIZE;
1705 FreqBuff.resize(padded_size, 0);
1706
1707 CurrFreqOffset += FreqBuff.size();
1708 CurrObsrOffset += ObsrBuff.size();
1709 s_WriteUint4List(tmp_freq_buff, FreqBuff);
1710 s_WriteUint4List(tmp_obsr_buff, ObsrBuff);
1711
1712 }
1713
1714 tmp_obsr_buff.flush();
1715 tmp_freq_buff.flush();
1716 x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1717 }
1718
1719
x_ValidateCd(const list<double> & freqs,const list<double> & observ,unsigned int alphabet_size)1720 bool CMakeProfileDBApp::x_ValidateCd(const list<double>& freqs,
1721 const list<double>& observ,
1722 unsigned int alphabet_size)
1723 {
1724
1725 if (freqs.size() / alphabet_size != observ.size())
1726 {
1727 string err = "Number of frequency and observations columns do not match";
1728 NCBI_THROW(CException, eInvalid, err);
1729 }
1730
1731 ITERATE (list<double>, it, freqs)
1732 {
1733 unsigned int residue = 0;
1734 double sum = 0.0;
1735 while (residue < alphabet_size - 1)
1736 {
1737 sum += *it;
1738 it++;
1739 residue++;
1740 }
1741 sum += *it;
1742
1743 if (fabs(sum - 1.0) > kEpsylon)
1744 return false;
1745 }
1746
1747 ITERATE (list<double>, it, observ)
1748 {
1749 if (*it < 1.0)
1750 return false;
1751 }
1752
1753 return true;
1754 }
1755
1756
x_CheckDelta(const CPssm & pssm,Int4 seq_size,const string & filename)1757 bool CMakeProfileDBApp::x_CheckDelta( const CPssm & pssm, Int4 seq_size, const string & filename)
1758 {
1759 // get weightd residue frequencies
1760 const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1761
1762 // get number of independent observations
1763 const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1764
1765 int alphabet_size = pssm.GetNumRows();
1766 list<double> modify_freqs;
1767
1768 if(pssm.GetByRow())
1769 {
1770 // need to flip the freq matrix
1771 vector<double> tmp(orig_freqs.size());
1772 list<double>::const_iterator f_itr = orig_freqs.begin();
1773
1774 for(int i = 0; i < alphabet_size; i++)
1775 {
1776 for(int j = 0; j < seq_size; j++)
1777 {
1778 tmp[i + j*alphabet_size] = *f_itr;
1779 ++f_itr;
1780 }
1781 }
1782 copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1783 }
1784
1785 // Pad matrix if necessary
1786 if(alphabet_size < BLASTAA_SIZE)
1787 {
1788 if(0 == modify_freqs.size())
1789 copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1790
1791 list<double>::iterator p_itr = modify_freqs.begin();
1792
1793 for (int j=0; j < seq_size; j++)
1794 {
1795 for(int i=0; i < alphabet_size; i++)
1796 {
1797 if(modify_freqs.end() == p_itr)
1798 break;
1799
1800 ++p_itr;
1801 }
1802
1803 modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1804 }
1805 }
1806
1807 const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1808 double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1809 if(max_obsr < m_ObsrvThreshold)
1810 {
1811 *m_LogFile << filename +
1812 " was excluded: due to too few independent observations\n";
1813 return false;
1814 }
1815
1816 if( !x_ValidateCd(freqs, obsr, BLASTAA_SIZE) && m_ExcludeInvalid)
1817 {
1818 *m_LogFile << filename +
1819 " was excluded: it conatins an invalid CD \n";
1820 return false;
1821 }
1822 return true;
1823 }
1824
1825
1826
x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo,CTmpFile & tmp_obsr_file,CTmpFile & tmp_freq_file,list<Int4> & FreqOffsets,list<Int4> & ObsrOffsets,Int4 CurrFreqOffset,Int4 CurrObsrOffset)1827 void CMakeProfileDBApp::x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
1828 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
1829 {
1830 FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1831 ObsrOffsets.push_back(CurrObsrOffset);
1832
1833 string wcounts_str = rpsDbInfo.db_name + ".wcounts";
1834 CNcbiOfstream wcounts_file(wcounts_str.c_str(), ios::out | ios::binary);
1835 if (!wcounts_file.is_open())
1836 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .wcounts file");
1837
1838 string obsr_str = rpsDbInfo.db_name + ".obsr";
1839 CNcbiOfstream obsr_file(obsr_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
1840 if (!obsr_file.is_open())
1841 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .obsr file");
1842
1843 CNcbiIfstream tmp_obsr_buff (tmp_obsr_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1844 CNcbiIfstream tmp_freq_buff (tmp_freq_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1845
1846 // write RPS BLAST database magic number
1847 Int4 magic_number = RPS_MAGIC_NUM_28;
1848 wcounts_file.write((char*)&magic_number, sizeof(Int4));
1849 obsr_file.write((char*)&magic_number, sizeof(Int4));
1850
1851 // write number of recrods
1852 Int4 num_wcounts_records = FreqOffsets.size() -1;
1853 Int4 num_obsr_records = ObsrOffsets.size() -1;
1854 wcounts_file.write((char*)&num_wcounts_records, sizeof(Int4));
1855 obsr_file.write((char*)&num_obsr_records, sizeof(Int4));
1856
1857 s_WriteInt4List(wcounts_file, FreqOffsets);
1858 wcounts_file.flush();
1859 wcounts_file << tmp_freq_buff.rdbuf();
1860 wcounts_file.flush();
1861 wcounts_file.close();
1862
1863 s_WriteInt4List(obsr_file, ObsrOffsets);
1864 obsr_file.flush();
1865 obsr_file << tmp_obsr_buff.rdbuf();
1866 obsr_file.flush();
1867 obsr_file.close();
1868 }
1869
x_CreateAliasFile(void)1870 void CMakeProfileDBApp::x_CreateAliasFile(void)
1871 {
1872 vector<string> v;
1873 for(unsigned int i=0; i < m_VolNames.size(); i++) {
1874 string t = kEmptyStr;
1875 CSeqDB_Substring s = SeqDB_RemoveDirName(CSeqDB_Substring( m_VolNames[i]));
1876 s.GetString(t);
1877 v.push_back(t);
1878 }
1879 CWriteDB_CreateAliasFile(m_OutDbName , v,
1880 CWriteDB::eProtein, kEmptyStr, m_Title, eNoAliasFilterType);
1881 }
1882
Run(void)1883 int CMakeProfileDBApp::Run(void)
1884 {
1885 int status = 0;
1886 try { x_Run(); }
1887 catch(const blast::CInputException& e) {
1888 ERR_POST(Error << "INPUT ERROR: " << e.GetMsg());
1889 status = BLAST_INPUT_ERROR;
1890 }
1891 catch (const CSeqDBException& e) {
1892 ERR_POST(Error << "ERROR: " << e.GetMsg());
1893 status = BLAST_DATABASE_ERROR;
1894 }
1895 catch (const blast::CBlastException& e) {
1896 ERR_POST(Error << "ERROR: " << e.GetMsg());
1897 status = BLAST_INPUT_ERROR;
1898 }
1899 catch (const CException& e) {
1900 ERR_POST(Error << "ERROR: " << e.GetMsg());
1901 status = BLAST_UNKNOWN_ERROR;
1902 }
1903 catch (...) {
1904 ERR_POST(Error << "Error: Unknown exception");
1905 status = BLAST_UNKNOWN_ERROR;
1906 }
1907
1908 x_AddCmdOptions();
1909 m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, status);
1910 return status;
1911 }
1912
x_AddCmdOptions(void)1913 void CMakeProfileDBApp::x_AddCmdOptions(void)
1914 {
1915 const CArgs & args = GetArgs();
1916 if (args["dbtype"].HasValue()) {
1917 m_UsageReport.AddParam(CBlastUsageReport::eDBType, args["dbtype"].AsString());
1918 }
1919 if(args["taxid"].HasValue() || args["taxid_map"].HasValue()) {
1920 m_UsageReport.AddParam(CBlastUsageReport::eTaxIdList, true);
1921 }
1922 }
1923
1924
1925 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])1926 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1927 {
1928 return CMakeProfileDBApp().AppMain(argc, argv);
1929 }
1930
1931
1932
1933
1934 #endif /* SKIP_DOXYGEN_PROCESSING */
1935
1936