1 /*  $Id: makeprofiledb.cpp 631568 2021-05-19 13:53:50Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author: Amelia Fong
27  *
28  */
29 
30 /** @file makeprofiledb.cpp
31  * Command line tool to create RPS,COBALT & DELTA BLAST databases.
32  * This is the successor to formatrpsdb from the C toolkit
33  */
34 
35 #include <ncbi_pch.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbimisc.hpp>
38 #include <corelib/ncbitime.hpp>
39 #include <util/math/matrix.hpp>
40 #include <serial/objistrasn.hpp>
41 #include <algo/blast/api/version.hpp>
42 #include <algo/blast/blastinput/cmdline_flags.hpp>
43 #include <algo/blast/blastinput/blast_input.hpp>
44 #include <algo/blast/api/pssm_engine.hpp>
45 #include <algo/blast/api/psi_pssm_input.hpp>
46 #include <objects/scoremat/PssmWithParameters.hpp>
47 #include <objects/scoremat/Pssm.hpp>
48 #include <objects/scoremat/PssmIntermediateData.hpp>
49 #include <objects/scoremat/PssmParameters.hpp>
50 #include <objects/scoremat/FormatRpsDbParameters.hpp>
51 #include <objects/scoremat/PssmFinalData.hpp>
52 #include <objects/scoremat/CoreBlock.hpp>
53 #include <objects/scoremat/CoreDef.hpp>
54 #include <objects/scoremat/LoopConstraint.hpp>
55 #include <algo/blast/core/blast_aalookup.h>
56 #include <algo/blast/core/blast_options.h>
57 #include <algo/blast/core/ncbi_math.h>
58 #include <objtools/blast/seqdb_writer/writedb.hpp>
59 #include <objtools/blast/seqdb_writer/build_db.hpp>
60 #include <objtools/blast/seqdb_writer/taxid_set.hpp>
61 #include <objtools/blast/seqdb_writer/writedb_files.hpp>
62 #include <objtools/blast/seqdb_reader/impl/seqdbgeneral.hpp>
63 #include "../blast/blast_app_util.hpp"
64 
65 #ifndef SKIP_DOXYGEN_PROCESSING
66 USING_NCBI_SCOPE;
67 USING_SCOPE(blast);
68 USING_SCOPE(objects);
69 #endif /* SKIP_DOXYGEN_PROCESSING */
70 
71 
72 //Input args specify to makeprofiledb
73 static const string kInPssmList("in");
74 static const string kOutDbName("out");
75 static const string kOutDbType("dbtype");
76 static const string kPssmScaleFactor("scale");
77 static const string kOutIndexFile("index");
78 static const string kObsrThreshold("obsr_threshold");
79 static const string kExcludeInvalid("exclude_invalid");
80 static const string kBinaryScoremat("binary");
81 static const string kUseCmdlineThreshold("force");
82 static const string kMaxSmpFilesPerVol("max_smp_vol");
83 
84 static const string kLogFile("logfile");
85 
86 //Supported Output Database Types
87 static const string kOutDbRps = "rps";
88 static const string kOutDbCobalt = "cobalt";
89 static const string kOutDbDelta = "delta";
90 
91 //Supported Matrices
92 static const string kMatrixBLOSUM62 = "BLOSUM62";
93 static const string kMatrixBLOSUM80 = "BLOSUM80";
94 static const string kMatrixBLOSUM50 = "BLOSUM50";
95 static const string kMatrixBLOSUM45 = "BLOSUM45";
96 static const string kMatrixBLOSUM90 = "BLOSUM90";
97 static const string kMatrixPAM250 = "PAM250";
98 static const string kMatrixPAM30 = "PAM30";
99 static const string kMatrixPAM70 = "PAM70";
100 
101 //Default Input Values
102 static const string kDefaultMatrix(kMatrixBLOSUM62);
103 static const string kDefaultOutDbType(kOutDbRps);
104 static const string kDefaultOutIndexFile("true");
105 static const string kDefaultExcludeInvalid("true");
106 #define kDefaultWordScoreThreshold (9.82)
107 #define kDefaultPssmScaleFactor (100.00)
108 #define kDefaultObsrThreshold (6.0)
109 #define kDefaultMaxSmpFilesPerVol (2500)
110 
111 //Fix point scale factor for delta blast
112 static const Uint4 kFixedPointScaleFactor = 1000;
113 #define kEpsylon (0.0001)
114 
115 #define DEFAULT_POS_MATRIX_SIZE	2000
116 #define RPS_NUM_LOOKUP_CELLS 32768
117 #if BLASTAA_SIZE == 28
118 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM_28
119 #else
120 #define RPS_DATABASE_VERSION RPS_MAGIC_NUM
121 #endif
122 
123 #define kSingleVol (-1)
124 
125 class CMakeDbPosMatrix
126 {
127 public:
CMakeDbPosMatrix()128 	CMakeDbPosMatrix():  m_posMatrix(NULL), m_size(0) { };
~CMakeDbPosMatrix()129 	~CMakeDbPosMatrix(){Delete();};
130 
131 	void Create(int seq_size);
132 	void Delete(void);
133 
Get(void)134 	Int4 ** Get(void) { return m_posMatrix;};
GetSize(void)135 	unsigned int GetSize(void){return m_size;};
136 
137 private:
138 
139 	Int4 ** m_posMatrix;
140 	int m_size;
141 };
142 
Create(int size)143 void CMakeDbPosMatrix::Create(int size)
144 {
145 	Delete();
146 
147 	m_posMatrix = new Int4* [size];
148 
149 	for(int i = 0; i < size; ++ i)
150 	{
151 		m_posMatrix[i] = new Int4[BLASTAA_SIZE];
152 	}
153 	m_size = size;
154 
155 	return;
156 }
157 
Delete(void)158 void CMakeDbPosMatrix::Delete(void)
159 {
160 	if( NULL == m_posMatrix)
161 		return;
162 
163 	for(int i = 0; i < m_size; ++ i)
164 	{
165 		if (m_posMatrix[i] != NULL)
166 			delete [] m_posMatrix[i];
167 	}
168 
169 	delete [] m_posMatrix;
170 	m_posMatrix = NULL;
171 	return;
172 }
173 
174 class CMakeProfileDBApp : public CNcbiApplication
175 {
176 public:
177     /** @inheritDoc */
178     CMakeProfileDBApp(void);
179     ~CMakeProfileDBApp();
180 private:
181     /** @inheritDoc */
182     virtual void Init();
183     /** @inheritDoc */
184     virtual int Run();
185 
186     enum op_mode
187     {
188     	op_rps ,
189     	op_cobalt,
190     	op_delta ,
191     	op_invalid
192     };
193 
194     class CRPS_DbInfo
195     {
196     public:
197         string db_name;
198         Int4 num_seqs;
199         CNcbiOfstream lookup_file;
200         CNcbiOfstream pssm_file;
201         CNcbiOfstream aux_file;
202         CNcbiOfstream blocks_file;
203         CNcbiOfstream freq_file;
204 
205         CMakeDbPosMatrix pos_matrix;
206         Int4 gap_open;
207         Int4 gap_extend;
208         Int4 scale_factor;
209         Int4 curr_seq_offset;
210         QuerySetUpOptions *query_options;
211         LookupTableOptions *lookup_options;
212         BlastAaLookupTable *lookup;
213         string matrix;
214         CRef<CWriteDB>	output_db;
215 
CRPS_DbInfo(void)216         CRPS_DbInfo(void):
217         		db_name(kEmptyStr), num_seqs(0),
218         		gap_open(0), gap_extend(0),scale_factor(0), curr_seq_offset(0),
219         		query_options(NULL), lookup_options(NULL), lookup(NULL)
220         { };
~CRPS_DbInfo()221         ~CRPS_DbInfo()
222         {
223         	if( NULL != query_options) {
224         		BlastQuerySetUpOptionsFree(query_options);
225         	}
226 
227         	if(NULL != lookup) {
228         		BlastAaLookupTableDestruct(lookup);
229         	}
230 
231         	if(NULL != lookup_options) {
232         		LookupTableOptionsFree(lookup_options);
233         	}
234         };
235     };
236 
237     enum CheckInputScoremat_RV
238     {
239     	sm_valid_has_pssm,
240     	sm_valid_freq_only,
241      	sm_invalid
242     };
243 
244     enum
245     {
246     	eUndefined = -1,
247     	eFalse,
248     	eTrue
249     };
250 
251     CheckInputScoremat_RV x_CheckInputScoremat(const CPssmWithParameters & pssm_w_parameters,
252     										   const string & filename);
253     void x_SetupArgDescriptions(void);
254     void x_InitProgramParameters(void);
255     vector<string> x_GetSMPFilenames(void);
256     void x_InitOutputDb(CRPS_DbInfo & rpsDBInfo);
257     void x_InitRPSDbInfo(CRPS_DbInfo & rpsDBInfo, Int4 vol, Int4 num_files);
258     void x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p);
259     void x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters  & pssm_w_parameters, bool freq_only);
260     void x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size);
261     void x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size);
262     void x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_p);
263     void x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size);
264     void x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo);
265     void x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters  & pssm_p, Int4 seq_size);
266     bool x_CheckDelta( const CPssm  & pssm, Int4 seq_size, const string & filename);
267     bool x_ValidateCd(const list<double>& freqs, const list<double>& observ, unsigned int alphabet_size);
268     void x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
269     		 list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset);
270     vector<string> x_CreateDeltaList(void);
271     void x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size);
272     void x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames);
273     bool x_IsUpdateFreqRatios(const CPssm & p);
274     void x_MakeVol(Int4 vol, vector<string> & smps);
275 
276     int x_Run(void);
277 
278     void x_AddCmdOptions(void);
279     void x_CreateAliasFile(void);
280 
281     // Data
282     CNcbiOstream * m_LogFile;
283     CNcbiIstream * m_InPssmList;
284     string m_Title;
285     double m_WordDefaultScoreThreshold;
286     string m_OutDbName;
287     string m_OutDbType;
288     bool m_CreateIndexFile;
289     int m_GapOpenPenalty;
290     int m_GapExtPenalty;
291     double m_PssmScaleFactor;
292     string m_Matrix;
293     op_mode m_op_mode;
294     bool m_binary_scoremat;
295     int m_MaxSmpFilesPerVol;
296     int m_NumOfVols;
297 
298     EBlastDbVersion m_DbVer;
299     CRef<CTaxIdSet> m_Taxids;
300     bool m_Done;
301 
302     //For Delta Blast
303 	double m_ObsrvThreshold;
304 	bool m_ExcludeInvalid;
305 
306 	int m_UpdateFreqRatios;
307 	bool m_UseModelThreshold;
308 
309 	vector<string> m_VolNames;
310     CBlastUsageReport m_UsageReport;
311     CStopWatch m_StopWatch;
312 };
313 
CMakeProfileDBApp(void)314 CMakeProfileDBApp::CMakeProfileDBApp(void)
315                 : m_LogFile(NULL), m_InPssmList(NULL), m_Title(kEmptyStr),
316                   m_WordDefaultScoreThreshold(0), m_OutDbName(kEmptyStr),
317                   m_OutDbType(kEmptyStr), m_CreateIndexFile(false),m_GapOpenPenalty(0),
318                   m_GapExtPenalty(0), m_PssmScaleFactor(0),m_Matrix(kEmptyStr),  m_op_mode(op_invalid),
319                   m_binary_scoremat(false), m_MaxSmpFilesPerVol(0), m_NumOfVols(0), m_DbVer(eBDB_Version5),
320                   m_Taxids(new CTaxIdSet()), m_Done(false),
321                   m_ObsrvThreshold(0), m_ExcludeInvalid(false),
322                   m_UpdateFreqRatios(eUndefined), m_UseModelThreshold(true)
323 {
324 	CRef<CVersion> version(new CVersion());
325 	version->SetVersionInfo(new CBlastVersion());
326 	SetFullVersion(version);
327     m_StopWatch.Start();
328     if (m_UsageReport.IsEnabled()) {
329     	m_UsageReport.AddParam(CBlastUsageReport::eVersion, GetVersion().Print());
330     	m_UsageReport.AddParam(CBlastUsageReport::eProgram, (string) "makeprofiledb");
331     }
332 }
333 
~CMakeProfileDBApp()334 CMakeProfileDBApp::~CMakeProfileDBApp()
335 {
336 	// NEED CLEAN UP CODE !!!!
337 	 if(m_Done == false)
338 	 {
339 		 for(unsigned int i =0; i < m_VolNames.size(); i ++)
340 	 	 {
341 			string rps_str = m_VolNames[i] + ".rps";
342 		 	string lookup_str = m_VolNames[i] + ".loo";
343 		 	string aux_str = m_VolNames[i] + ".aux";
344 		 	string freq_str = m_VolNames[i] + ".freq";
345 		 	CFile(rps_str).Remove();
346 		 	CFile(lookup_str).Remove();
347 		 	CFile(aux_str).Remove();
348 		 	CFile(freq_str).Remove();
349 
350 		 	if(op_cobalt == m_op_mode)
351 		 	{
352 				string blocks_str = m_VolNames[i] + ".blocks";
353 		 		CFile(blocks_str).Remove();
354 		 	}
355 
356 		 	if(op_delta == m_op_mode)
357 		 	{
358 		 		string wcounts_str = m_VolNames[i] + ".wcounts";
359 		 		string obsr_str = m_VolNames[i] + ".obsr";
360 		 		CFile(wcounts_str).Remove();
361 		 		CFile(obsr_str).Remove();
362 		 	}
363 	 	 }
364 		 if (m_VolNames.size() > 1) {
365 			 string pal_str = m_OutDbName + ".pal";
366 			 CFile(pal_str).Remove();
367 		 }
368 	 }
369 	 else
370 	 {
371 		 for(unsigned int i =0; i < m_VolNames.size(); i ++) {
372 			 string pog_str = m_VolNames[i] + ".pog";
373 			 CFile(pog_str).Remove();
374 		 }
375 	 }
376 	 m_UsageReport.AddParam(CBlastUsageReport::eRunTime, m_StopWatch.Elapsed());
377 }
378 
x_SetupArgDescriptions(void)379 void CMakeProfileDBApp::x_SetupArgDescriptions(void)
380 {
381     HideStdArgs(fHideConffile | fHideFullVersion | fHideXmlHelp | fHideDryRun);
382 
383     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
384 
385     // Specify USAGE context
386     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
387                   "Application to create databases for rpsblast, cobalt and deltablast, version "
388                   + CBlastVersion().Print());
389 
390     string dflt("Default = input file name provided to -");
391     dflt += kInPssmList + " argument";
392 
393     arg_desc->SetCurrentGroup("Input options");
394     arg_desc->AddKey(kInPssmList, "in_pssm_list",
395                      "Input file that contains a list of smp files (delimited by space, tab or newline)",
396                      CArgDescriptions::eInputFile);
397 
398     arg_desc->AddFlag(kBinaryScoremat,
399        				  "Scoremats are in binary format",
400        				  true);
401 
402     arg_desc->SetCurrentGroup("Configuration options");
403     arg_desc->AddOptionalKey(kArgDbTitle, "database_title",
404                              "Title for database\n" + dflt,
405                              CArgDescriptions::eString);
406 
407     arg_desc->AddDefaultKey(kArgWordScoreThreshold, "word_score_threshold",
408     						"Minimum word score to add a word to the lookup table",
409     						CArgDescriptions::eDouble,
410     						NStr::DoubleToString(kDefaultWordScoreThreshold));
411     arg_desc->AddFlag(kUseCmdlineThreshold, "Use cmdline threshold", true);
412 
413     arg_desc->SetCurrentGroup("Output options");
414     arg_desc->AddOptionalKey(kOutDbName, "database_name",
415                              "Name of database to be created\n" +
416                               dflt , CArgDescriptions::eString);
417 
418     arg_desc->AddDefaultKey("blastdb_version", "version",
419                              "Version of BLAST database to be created",
420                              CArgDescriptions::eInteger,
421                              NStr::NumericToString(static_cast<int>(eBDB_Version5)));
422     arg_desc->SetConstraint("blastdb_version",
423                             new CArgAllow_Integers(eBDB_Version4, eBDB_Version5));
424 
425     arg_desc->AddDefaultKey(kMaxSmpFilesPerVol, "max_smp_files_per_vol",
426                             "Maximum number of SMP files per DB volume",
427                             CArgDescriptions::eInteger, NStr::IntToString(kDefaultMaxSmpFilesPerVol));
428 
429     arg_desc->AddDefaultKey(kOutDbType, "output_db_type",
430                             "Output database type: cobalt, delta, rps",
431                             CArgDescriptions::eString, kDefaultOutDbType);
432     arg_desc->SetConstraint(kOutDbType, &(*new CArgAllow_Strings, kOutDbRps, kOutDbCobalt , kOutDbDelta ));
433 
434     arg_desc->AddDefaultKey(kOutIndexFile, "create_index_files",
435                             "Create Index Files",
436                             CArgDescriptions::eBoolean, kDefaultOutIndexFile);
437 
438     arg_desc->SetCurrentGroup("Used only if scoremat files do not contain PSSM scores, ignored otherwise.");
439     arg_desc->AddOptionalKey(kArgGapOpen, "gap_open_penalty",
440                             "Cost to open a gap",
441                             CArgDescriptions::eInteger);
442 
443     arg_desc->AddOptionalKey(kArgGapExtend, "gap_extend_penalty",
444                             "Cost to extend a gap, ",
445                             CArgDescriptions::eInteger);
446 
447     arg_desc->AddDefaultKey(kPssmScaleFactor, "pssm_scale_factor",
448                             "Pssm Scale factor ",
449                             CArgDescriptions::eDouble,
450                             NStr::DoubleToString(kDefaultPssmScaleFactor));
451 
452     arg_desc->AddDefaultKey(kArgMatrixName, "matrix_name",
453                             "Scoring matrix name",
454                             CArgDescriptions::eString,
455                             kDefaultMatrix);
456     arg_desc->SetConstraint(kArgMatrixName, &(*new CArgAllow_Strings,kMatrixBLOSUM62, kMatrixBLOSUM80,
457     						kMatrixBLOSUM50, kMatrixBLOSUM45, kMatrixBLOSUM90, kMatrixPAM250, kMatrixPAM30, kMatrixPAM70));
458 
459     //Delta Blast Options
460     arg_desc->SetCurrentGroup("Delta Blast Options");
461     arg_desc->AddDefaultKey(kObsrThreshold, "observations_threshold", "Exclude domains with "
462                             "with maximum number of independent observations "
463                             "below this threshold", CArgDescriptions::eDouble,
464                             NStr::DoubleToString(kDefaultObsrThreshold));
465 
466     arg_desc->AddDefaultKey(kExcludeInvalid, "exclude_invalid", "Exclude domains that do "
467                             "not pass validation test",
468                             CArgDescriptions::eBoolean, kDefaultExcludeInvalid);
469 
470     arg_desc->SetCurrentGroup("Taxonomy options");
471     arg_desc->AddOptionalKey("taxid", "TaxID",
472                              "Taxonomy ID to assign to all sequences",
473                              CArgDescriptions::eInteger);
474     arg_desc->SetConstraint("taxid", new CArgAllowValuesGreaterThanOrEqual(0));
475     arg_desc->SetDependency("taxid", CArgDescriptions::eExcludes, "taxid_map");
476 
477     arg_desc->AddOptionalKey("taxid_map", "TaxIDMapFile",
478              "Text file mapping sequence IDs to taxonomy IDs.\n"
479              "Format:<SequenceId> <TaxonomyId><newline>",
480              CArgDescriptions::eInputFile);
481 
482     SetupArgDescriptions(arg_desc.release());
483 }
484 
x_InitProgramParameters(void)485 void CMakeProfileDBApp::x_InitProgramParameters(void)
486 {
487 	const CArgs& args = GetArgs();
488 
489 	//log_file
490 	if (args[kLogFile].HasValue())
491 		m_LogFile = &args[kLogFile].AsOutputFile();
492 	else
493 		m_LogFile = &cout;
494 
495 
496 	//in_list
497 	if (args[kInPssmList].HasValue())
498 		m_InPssmList = &args[kInPssmList].AsInputFile();
499 	else
500 		NCBI_THROW(CInputException, eInvalidInput,  "Please provide an input file with list of smp files");
501 
502 	// Binary Scoremat
503 	m_binary_scoremat = args[kBinaryScoremat];
504 
505 	//title
506 	if (args[kArgDbTitle].HasValue())
507 		m_Title = args[kArgDbTitle].AsString();
508 	else
509 		m_Title = args[kInPssmList].AsString();
510 
511 	//threshold
512 	m_WordDefaultScoreThreshold = args[kArgWordScoreThreshold].AsDouble();
513 
514 	//Out
515 	if(args[kOutDbName].HasValue())
516 		m_OutDbName = args[kOutDbName].AsString();
517 	else
518 		m_OutDbName = args[kInPssmList].AsString();
519 
520 	//Number of SMP files per db vol
521 	m_MaxSmpFilesPerVol = args[kMaxSmpFilesPerVol].AsInteger();
522 
523 	//out_db_type
524 	m_OutDbType = args[kOutDbType].AsString();
525 	if(kOutDbRps == m_OutDbType)
526 		m_op_mode = op_rps;
527 	else if (kOutDbCobalt == m_OutDbType)
528 		m_op_mode = op_cobalt;
529 	else if(kOutDbDelta == m_OutDbType)
530 		m_op_mode = op_delta;
531 	else
532 		NCBI_THROW(CInputException, eInvalidInput,  "Invalid Output database type");
533 
534 	m_CreateIndexFile = args[kOutIndexFile].AsBoolean();
535 
536 	int default_gap_open = 0;
537 	int default_gap_extend = 0;
538 	//matrix
539 	m_Matrix = args[kArgMatrixName].AsString();
540 	 BLAST_GetProteinGapExistenceExtendParams(m_Matrix.c_str(), &default_gap_open, &default_gap_extend);
541 
542 	//gapopen
543 	if(args[kArgGapOpen].HasValue())
544 		m_GapOpenPenalty = args[kArgGapOpen].AsInteger();
545 	else
546 		m_GapOpenPenalty = default_gap_open;
547 
548 	//gapextend
549 	if(args[kArgGapExtend].HasValue())
550 		m_GapExtPenalty = args[kArgGapExtend].AsInteger();
551 	else
552 		m_GapExtPenalty = default_gap_extend;
553 
554 	//pssm scale factor
555 	m_PssmScaleFactor = args[kPssmScaleFactor].AsDouble();
556 
557 	//matrix
558 	m_Matrix = args[kArgMatrixName].AsString();
559 
560 	//Delta Blast Parameters
561 	m_ObsrvThreshold = args[kObsrThreshold].AsDouble();
562 	m_ExcludeInvalid = args[kExcludeInvalid].AsBoolean();
563 
564 	if (args[kUseCmdlineThreshold]){
565 		m_UseModelThreshold = false;
566 	}
567     m_DbVer = static_cast<EBlastDbVersion>(args["blastdb_version"].AsInteger());
568 
569     if (args["taxid"].HasValue()) {
570         _ASSERT( !args["taxid_map"].HasValue() );
571         m_Taxids.Reset(new CTaxIdSet(TAX_ID_FROM(int, args["taxid"].AsInteger())));
572     } else if (args["taxid_map"].HasValue()) {
573         _ASSERT( !args["taxid"].HasValue() );
574         _ASSERT( !m_Taxids.Empty() );
575         m_Taxids->SetMappingFromFile(args["taxid_map"].AsInputFile());
576     }
577 }
578 
x_GetSMPFilenames(void)579 vector<string> CMakeProfileDBApp::x_GetSMPFilenames(void)
580 {
581 	vector<string> filenames;
582 
583 	while(!m_InPssmList->eof())
584 	{
585 		string line;
586 		vector<string> tmp;
587 		NcbiGetlineEOL(*m_InPssmList, line);
588 		NStr::Split(line, " \t\r", tmp, NStr::fSplit_MergeDelimiters | NStr::fSplit_Truncate);
589 
590 		if(tmp.size()  > 0)
591 			filenames.insert(filenames.end(), tmp.begin(), tmp.end() );
592 	}
593 
594 	if( 0 == filenames.size())
595 		NCBI_THROW(CInputException, eInvalidInput,  "Input file contains no smp filnames");
596 
597 	return filenames;
598 }
599 
600 CMakeProfileDBApp::CheckInputScoremat_RV
x_CheckInputScoremat(const CPssmWithParameters & pssm_w_parameters,const string & filename)601 CMakeProfileDBApp::x_CheckInputScoremat(const CPssmWithParameters & pssm_w_parameters,
602 										const string & filename)
603 {
604 	CheckInputScoremat_RV sm = sm_invalid;
605 
606 	if(pssm_w_parameters.IsSetPssm())
607 	{
608 		const CPssm & pssm = pssm_w_parameters.GetPssm();
609 
610 		if(!pssm.IsSetQuery() || (0 == pssm.GetQueryLength()))
611 		{
612 			string err = filename + " contians no bioseq data";
613 			NCBI_THROW(CInputException, eInvalidInput,  err);
614 		}
615 
616 		if(!pssm.IsSetNumRows() || !pssm.IsSetNumColumns())
617 		{
618 			string err = filename + " contians no info on num of columns or num of rows";
619 			NCBI_THROW(CInputException, eInvalidInput,  err);
620 		}
621 
622 		if((int) (pssm.GetQueryLength()) != pssm.GetNumColumns())
623 		{
624 			string err = filename + " 's num of columns does not match size of sequence";
625 			NCBI_THROW(CInputException, eInvalidInput,  err);
626 		}
627 
628 		int num_rows = pssm.GetNumRows();
629 		if( num_rows <= 0 || num_rows > BLASTAA_SIZE )
630 		{
631 			string err = filename + " has invalid alphabet size";
632 			NCBI_THROW(CInputException, eInvalidInput,  err);
633 		}
634 
635 		// First time around
636 		if(eUndefined == m_UpdateFreqRatios)
637 		{
638 			m_UpdateFreqRatios = x_IsUpdateFreqRatios(pssm);
639 		}
640 
641 		if(m_UpdateFreqRatios && (!pssm.IsSetIntermediateData()|| !pssm.GetIntermediateData().IsSetFreqRatios()))
642 		{
643 			string err = filename + " contains no frequence ratios for building database";
644 			NCBI_THROW(CInputException, eInvalidInput,  err);
645 		}
646 
647 		if(op_cobalt == m_op_mode)
648 		{
649 			if(!pssm_w_parameters.IsSetParams() || !pssm_w_parameters.GetParams().IsSetConstraints() ||
650 			! pssm_w_parameters.GetParams().GetConstraints().IsSetBlocks())
651 			{
652 				string err = filename + " contains no core block to build cobalt database";
653 				NCBI_THROW(CInputException, eInvalidInput,  err);
654 			}
655 		}
656 
657 		if(pssm.IsSetFinalData())
658 		{
659 			sm = sm_valid_has_pssm;
660 		}
661 		else if(pssm.IsSetIntermediateData())
662 		{
663 			if(pssm.GetIntermediateData().IsSetFreqRatios())
664 			{
665 				sm = sm_valid_freq_only;
666 			}
667 		}
668 
669 		if(sm_invalid == sm)
670 		{
671 			string err = filename + " contians no pssm or residue frequencies";
672 			NCBI_THROW(CInputException, eInvalidInput,  err);
673 		}
674 	}
675 	else
676 	{
677 		string err = filename + " contians no scoremat";
678 		NCBI_THROW(CInputException, eInvalidInput,  err);
679 	}
680 
681 	return sm;
682 }
683 
x_IsUpdateFreqRatios(const CPssm & p)684 bool CMakeProfileDBApp::x_IsUpdateFreqRatios(const CPssm & p)
685 {
686 	if(op_cobalt == m_op_mode)
687 		return eTrue;
688 
689 	if(!p.IsSetIntermediateData()|| !p.GetIntermediateData().IsSetFreqRatios())
690 		return eFalse;
691 
692 	return eTrue;
693 }
694 
x_InitOutputDb(CRPS_DbInfo & rpsDbInfo)695 void CMakeProfileDBApp::x_InitOutputDb(CRPS_DbInfo & rpsDbInfo)
696 {
697 	CWriteDB::EIndexType index_type = (m_CreateIndexFile == true ? CWriteDB::eDefault : CWriteDB::eNoIndex);
698 	rpsDbInfo.output_db.Reset(new CWriteDB(rpsDbInfo.db_name, CWriteDB::eProtein, m_Title, index_type, m_CreateIndexFile, false, false, m_DbVer));
699 	rpsDbInfo.output_db->SetMaxFileSize(4000000000);
700 	return;
701 }
702 
s_DeleteMakeprofileDb(const string & name)703 static bool s_DeleteMakeprofileDb(const string & name )
704 {
705 	bool isRemoved = false;
706 	static const char * mp_ext[]={".rps", ".loo", ".aux", ".freq", ".blocks", ".wcounts", ".obsr", NULL};
707 	for(const char ** mp=mp_ext; *mp != NULL; mp++) {
708     	CNcbiOstrstream oss;
709     	oss << name << *mp;
710         const string fname = CNcbiOstrstreamToString(oss);
711         if (CFile(fname).Remove()) {
712         	LOG_POST(Info << "Deleted " << fname);
713         }
714         else {
715         	unsigned int index = 0;
716         	string vfname = name + "." + NStr::IntToString(index/10) +
717         			        NStr::IntToString(index%10) + *mp;
718         	while (CFile(vfname).Remove()) {
719         		index++;
720         		vfname = name + "." + NStr::IntToString(index/10) +
721         	    	     NStr::IntToString(index%10) + *mp;
722         	}
723         }
724     }
725 	if(DeleteBlastDb(name, CSeqDB::eProtein))
726 		isRemoved = true;
727 
728 	return isRemoved;
729 }
730 
731 
x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo,Int4 vol,Int4 num_files)732 void CMakeProfileDBApp::x_InitRPSDbInfo(CRPS_DbInfo & rpsDbInfo, Int4 vol, Int4 num_files)
733 {
734 
735      rpsDbInfo.num_seqs = num_files;
736      if(vol == kSingleVol) {
737     	 rpsDbInfo.db_name = m_OutDbName;
738      }
739      else if (vol >= 0) {
740     	 rpsDbInfo.db_name = CWriteDB_File::MakeShortName(m_OutDbName, vol);
741      }
742      else {
743     	 NCBI_THROW(CBlastException, eCoreBlastError,"Invalid vol number");
744      }
745 
746      string rps_str = rpsDbInfo.db_name + ".rps";
747      rpsDbInfo.pssm_file.open(rps_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
748      if (!rpsDbInfo.pssm_file.is_open())
749     	 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .rps file ");
750 
751      string lookup_str = rpsDbInfo.db_name + ".loo";
752      rpsDbInfo.lookup_file.open(lookup_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
753      if (!rpsDbInfo.lookup_file.is_open())
754     	 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .loo file");
755 
756      string aux_str = rpsDbInfo.db_name + ".aux";
757      rpsDbInfo.aux_file.open(aux_str.c_str());
758      if (!rpsDbInfo.aux_file.is_open())
759     	 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .aux file");
760 
761 	 string freq_str = rpsDbInfo.db_name + ".freq";
762 	 rpsDbInfo.freq_file.open(freq_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
763 	 if (!rpsDbInfo.freq_file.is_open())
764 		 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .freq file");
765 
766      /* Write the magic numbers to the PSSM file */
767 
768      Int4 version = RPS_DATABASE_VERSION;
769       rpsDbInfo.pssm_file.write ((char *)&version , sizeof(Int4));
770       rpsDbInfo.freq_file.write ((char *)&version , sizeof(Int4));
771 
772      /* Fill in space for the sequence offsets. The PSSM
773         data gets written after this list of integers. Also
774         write the number of sequences to the PSSM file */
775 
776       rpsDbInfo.pssm_file.write((char *) &num_files, sizeof(Int4));
777       rpsDbInfo.freq_file.write((char *) &num_files, sizeof(Int4));
778      for (Int4 i = 0; i <= num_files; i++)
779      {
780     	 rpsDbInfo.pssm_file.write((char *)&i, sizeof(Int4));
781     	 rpsDbInfo.freq_file.write((char *)&i, sizeof(Int4));
782      }
783 
784      if(op_cobalt == m_op_mode)
785      {
786     	 string blocks_str = rpsDbInfo.db_name + ".blocks";
787     	 rpsDbInfo.blocks_file.open(blocks_str.c_str());
788     	 if (!rpsDbInfo.blocks_file.is_open())
789     		 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .blocks file");
790      }
791 
792 
793      rpsDbInfo.curr_seq_offset = 0;
794      //Init them to input arg values first , may change after reading in the first sequence
795      rpsDbInfo.gap_extend = m_GapExtPenalty;
796      rpsDbInfo.gap_open = m_GapOpenPenalty;
797      rpsDbInfo.matrix = m_Matrix;
798      rpsDbInfo.scale_factor = (Int4) ceil(m_PssmScaleFactor);
799 
800      return;
801  }
802 
803 //For first sequence only
x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo,const CPssmWithParameters & pssm_p)804 void CMakeProfileDBApp::x_UpdateRPSDbInfo(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p)
805 {
806 	if(pssm_p.IsSetParams())
807 	{
808 		if(pssm_p.GetParams().IsSetRpsdbparams())
809 		{
810 			const CFormatRpsDbParameters & rps_db_params = pssm_p.GetParams().GetRpsdbparams();
811 			if(rps_db_params.IsSetGapExtend())
812 				rpsDbInfo.gap_extend = rps_db_params.GetGapExtend();
813 
814 			if(rps_db_params.IsSetGapOpen())
815 			     rpsDbInfo.gap_open = rps_db_params.GetGapOpen();
816 
817 			if(rps_db_params.IsSetMatrixName())
818 			     rpsDbInfo.matrix = rps_db_params.GetMatrixName();
819 		}
820 	}
821 	return;
822 }
823 
x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo,CPssmWithParameters & pssm_p)824 void CMakeProfileDBApp::x_FillInRPSDbParameters(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters &pssm_p )
825 {
826 	if(!pssm_p.IsSetParams())
827 		pssm_p.SetParams();
828 
829 	if(!pssm_p.GetParams().IsSetRpsdbparams())
830 		pssm_p.SetParams().SetRpsdbparams();
831 
832 	CFormatRpsDbParameters & rps_params= pssm_p.SetParams().SetRpsdbparams();
833 	if(!rps_params.IsSetGapExtend())
834 		rps_params.SetGapExtend(rpsDbInfo.gap_extend);
835 	else if(rps_params.GetGapExtend() != rpsDbInfo.gap_extend)
836     	 NCBI_THROW(CBlastException, eCoreBlastError, "Gap extend penalties do not match");
837 
838 	if(!rps_params.IsSetGapOpen())
839 		rps_params.SetGapOpen(rpsDbInfo.gap_open);
840 	else if(rps_params.GetGapOpen() != rpsDbInfo.gap_open)
841     	 NCBI_THROW(CBlastException, eCoreBlastError, "Gap open penalties do not match");
842 
843 	if(!rps_params.IsSetMatrixName())
844 		rps_params.SetMatrixName (rpsDbInfo.matrix);
845 	else if(rps_params.GetMatrixName()!= rpsDbInfo.matrix)
846     	 NCBI_THROW(CBlastException, eCoreBlastError, "Score matrix does not match");
847 
848 	return;
849 }
850 
851 /* Update the input scoremat with a new PSSM and modified
852    statistics. Scoremat must contain only residue frequencies.
853    Note that upon completion the new PSSM will always have
854    columns of length BLASTAA_SIZE
855         seq is the sequence and set of score frequencies read in
856                 from the next data file
857         seq_size is the number of letters in this sequence
858         alphabet_size refers to the number of PSSM rows
859         ScalingFactor is the multiplier for all PSSM scores
860 */
x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo,CPssmWithParameters & seq,Int4 seq_size)861 void CMakeProfileDBApp::x_RPSUpdateStatistics(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & seq, Int4 seq_size )
862 {
863 
864     CPssm & pssm = seq.SetPssm();
865     const CPssmParameters & params = seq.GetParams();
866     string matrix_name = params.GetRpsdbparams().GetMatrixName();
867 
868     /* Read in the sequence residues from the scoremat structure. */
869     CNCBIstdaa query_stdaa;
870     pssm.GetQuerySequenceData(query_stdaa);
871 
872     vector <char>   query_v = query_stdaa.Get();
873 
874     if((Int4) (query_v.size()) != seq_size)
875     	 NCBI_THROW(CBlastException, eCoreBlastError, "Query sequence lengths mismatch");
876 
877     /* allocate query array and PSSM row array */
878     AutoArray<Uint1>  query(seq_size);
879 
880     for(unsigned int i = 0; i < query_v.size(); i++)
881     	query[i] = query_v[i];
882 
883     auto_ptr<CNcbiMatrix <double> > freq_list (CScorematPssmConverter::GetFreqRatios(seq));
884 
885     CPsiBlastInputFreqRatios pssm_freq_ratio(query.get(), seq_size, *freq_list,
886     										 matrix_name.c_str(), rpsDbInfo.gap_open,
887     										 rpsDbInfo.gap_extend, rpsDbInfo.scale_factor);
888     CPssmEngine pssm_engine(&pssm_freq_ratio);
889     CRef<CPssmWithParameters> out_par(pssm_engine.Run());
890 
891     CPssmFinalData & i = pssm.SetFinalData();
892     const CPssmFinalData & o = out_par->GetPssm().GetFinalData();
893     i.SetScores() = o.GetScores();
894     i.SetLambda() = o.GetLambda();
895     i.SetKappa() = o.GetKappa();
896     i.SetH() = o.GetH();
897     i.SetScalingFactor(rpsDbInfo.scale_factor);
898 
899     return;
900 }
901 
902  /* The first sequence in the list determines several
903     parameters that all other sequences in the list must
904     have. In this case, extra initialization is required
905 
906          info contains all the information on data files
907                  and parameters from previously added sequences
908          seq is the sequence and PSSM read in from the next data file
909          seq_index refers to the (0-based) position of this sequence
910                  in the complete list of seqences
911          seq_size is the number of letters in this sequence
912          alphabet_size refers to the number of PSSM rows
913  */
x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo,CPssmWithParameters & pssm_w_parameters,bool freq_only)914  void CMakeProfileDBApp::x_RPSAddFirstSequence(CRPS_DbInfo & rpsDbInfo, CPssmWithParameters & pssm_w_parameters, bool freq_only )
915  {
916      x_UpdateRPSDbInfo(rpsDbInfo, pssm_w_parameters);
917 
918      x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
919      double wordScoreThreshold = m_WordDefaultScoreThreshold;
920 
921      if(!freq_only)
922      {
923     	 if(pssm_w_parameters.GetPssm().GetFinalData().IsSetScalingFactor())
924     	 {
925     		 rpsDbInfo.scale_factor = pssm_w_parameters.GetPssm().GetFinalData().GetScalingFactor();
926     	 }
927     	 else
928     	 {
929     	     // asn1 default value is 1
930     	     rpsDbInfo.scale_factor = 1.0;
931     	 }
932     	 if(m_UseModelThreshold && pssm_w_parameters.GetPssm().GetFinalData().IsSetWordScoreThreshold())
933     	 {
934     	 	 wordScoreThreshold = pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
935     	 }
936      }
937      else
938      {
939     	 x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, pssm_w_parameters.GetPssm().GetQueryLength());
940      }
941 
942      /* scale up the threshold value and convert to integer */
943      double threshold = rpsDbInfo.scale_factor * wordScoreThreshold;
944 
945      /* create BLAST lookup table */
946      if (LookupTableOptionsNew(eBlastTypeBlastp, &(rpsDbInfo.lookup_options)) != 0)
947     	 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot create lookup options");
948 
949      if (BLAST_FillLookupTableOptions(rpsDbInfo.lookup_options, eBlastTypePsiBlast,
950                                       FALSE, /* no megablast */
951                                       threshold, /* neighboring threshold */
952                                       BLAST_WORDSIZE_PROT ) != 0)
953     	 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot set lookup table options");
954 
955      if (BlastAaLookupTableNew(rpsDbInfo.lookup_options, &(rpsDbInfo.lookup)) != 0)
956     	 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot allocate lookup table");
957 
958      rpsDbInfo.lookup->use_pssm = TRUE;  /* manually turn on use of PSSMs */
959 
960      /* Perform generic query setup */
961 
962      if (BlastQuerySetUpOptionsNew(&(rpsDbInfo.query_options)) != 0)
963     	 NCBI_THROW(CBlastException, eCoreBlastError, "Generic query setup failed");
964 
965      if (BLAST_FillQuerySetUpOptions(rpsDbInfo.query_options, eBlastTypeBlastp,
966                                      NULL,        /* no filtering */
967                                      0            /* strand not applicable */ ) != 0)
968     	 NCBI_THROW(CBlastException, eCoreBlastError, "Cannot fill query options");
969 
970      /* Write the header of the RPS .aux file */
971      rpsDbInfo.aux_file << rpsDbInfo.matrix << "\n";
972      rpsDbInfo.aux_file << rpsDbInfo.gap_open << "\n";
973      rpsDbInfo.aux_file << rpsDbInfo.gap_extend << "\n";
974      rpsDbInfo.aux_file << scientific << 0.0 << "\n";
975      rpsDbInfo.aux_file << scientific << 0.0 << "\n";
976      rpsDbInfo.aux_file << (int) 0 << "\n";
977      rpsDbInfo.aux_file << (int) 0 << "\n";
978      rpsDbInfo.aux_file << fixed << (double) rpsDbInfo.scale_factor << "\n";
979 
980      return;
981  }
982 
x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo,const CPssmWithParameters & pssm_p,Int4 seq_size)983  void CMakeProfileDBApp::x_UpdateCobalt(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_size)
984  {
985 	 const CPssm & pssm = pssm_p.GetPssm();
986 	 // Update .blocks file
987 	 const list<CRef<CCoreBlock> > & block_list = pssm_p.GetParams().GetConstraints().GetBlocks();
988 
989 	 list<CRef<CCoreBlock> >::const_iterator  itr = block_list.begin();
990 
991 	 int count =0;
992 
993 	 while(itr != block_list.end())
994 	 {
995 		 const CCoreBlock & block = (**itr);
996 		 if(!block.IsSetStart() || !block.IsSetStop())
997 			NCBI_THROW(CInputException, eInvalidInput,  "No start Or stop found in conserved block");
998 
999 		 string seq_id_str = "id" + NStr::IntToString(count);
1000 		 if(pssm.IsSetQuery())
1001 		 {
1002 			 if(pssm.GetQuery().IsSeq())
1003 			 {
1004 				 if(pssm.GetQuery().GetSeq().IsSetDescr())
1005 				 {
1006 					 const list<CRef<CSeqdesc> > descr_list= pssm.GetQuery().GetSeq().GetDescr();
1007 					 if(descr_list.size() > 0)
1008 					 {
1009 						const CRef<CSeqdesc> descr = descr_list.front();
1010 						if(descr->IsTitle())
1011 						{
1012 							string title = descr->GetTitle();
1013 							string accession;
1014 					 		string tmp;
1015 					 		if(NStr::SplitInTwo(title, ",", accession, tmp))
1016 						 		seq_id_str = accession;
1017 						 }
1018 					 }
1019 				 }
1020 			 }
1021 		 }
1022 
1023 		 rpsDbInfo.blocks_file << seq_id_str << "\t";
1024 		 rpsDbInfo.blocks_file << count << "\t";
1025 		 rpsDbInfo.blocks_file << block.GetStart() << "\t";
1026 		 rpsDbInfo.blocks_file << block.GetStop() << "\n";
1027 		 count++;
1028 		 ++itr;
1029 	 }
1030 	 return;
1031  }
x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo,const CPssmWithParameters & pssm_p,Int4 seq_index,Int4 seq_size)1032 void CMakeProfileDBApp::x_UpdateFreqRatios(CRPS_DbInfo & rpsDbInfo, const CPssmWithParameters & pssm_p, Int4 seq_index, Int4 seq_size)
1033  {
1034 	if (!m_UpdateFreqRatios)
1035 		return;
1036 
1037 	 const CPssm & pssm = pssm_p.GetPssm();
1038 	 // Update .freq file
1039 	 Int4 i = 0;
1040 	 Int4 j = 0;
1041 	 Int4 row[BLASTAA_SIZE];
1042 	 Int4 alphabet_size = pssm.GetNumRows();
1043 
1044 	 const list<double> & freq_ratios = pssm.GetIntermediateData().GetFreqRatios();
1045 	 list<double>::const_iterator itr_fr = freq_ratios.begin();
1046     rpsDbInfo.freq_file.seekp(0, ios_base::end);
1047 
1048 	 if (pssm.GetByRow() == FALSE) {
1049 	    for (i = 0; i < seq_size; i++) {
1050 	        for (j = 0; j < alphabet_size; j++) {
1051 	            if (itr_fr == freq_ratios.end())
1052 	                break;
1053 	            row[j] = (Int4) BLAST_Nint(*itr_fr * FREQ_RATIO_SCALE);
1054 	            ++itr_fr;
1055 	        }
1056 	        for ( ;j < BLASTAA_SIZE; j++) {
1057 	        	row[j] = 0;
1058 	        }
1059 	        rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1060 	    }
1061     }
1062     else {
1063     	auto_ptr<CNcbiMatrix<double> > matrix (CScorematPssmConverter::GetFreqRatios(pssm_p));
1064 
1065 	    for (i = 0; i < seq_size; i++) {
1066 	        for (j = 0; j < BLASTAA_SIZE; j++) {
1067 	            row[j] = (Int4) BLAST_Nint((*matrix)(i,j ) * FREQ_RATIO_SCALE);
1068 	        }
1069 	        rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1070 	    }
1071     }
1072 
1073 	memset(row, 0, sizeof(row));
1074 	rpsDbInfo.freq_file.write((const char *)row, sizeof(Int4)*BLASTAA_SIZE);
1075 
1076     rpsDbInfo.freq_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1077     rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1078 	return;
1079  }
1080 
1081  /* Incrementally update the BLAST lookup table with
1082     words derived from the present sequence
1083          info contains all the information on data files
1084                  and parameters from previously added sequences
1085          seq is the sequence and PSSM read in from the next data file
1086          seq_size is the number of letters in this sequence
1087  */
x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo,Int4 seq_size)1088  void CMakeProfileDBApp::x_RPSUpdateLookup(CRPS_DbInfo & rpsDbInfo, Int4 seq_size)
1089  {
1090      BlastSeqLoc *lookup_segment = NULL;
1091 
1092      /* Tell the blast engine to index the entire input
1093         sequence. Since only the PSSM matters for lookup
1094         table creation, the process does not require
1095         actually extracting the sequence data from 'seq'*/
1096 
1097      BlastSeqLocNew(&lookup_segment, 0, seq_size - 1);
1098 
1099      /* add this sequence to the lookup table. NULL
1100         is passed in place of the query */
1101 
1102      Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1103      if (NULL == posMatrix)
1104     	 NCBI_THROW(CBlastException, eCoreBlastError, "Empty pos matrix");
1105 
1106      BlastAaLookupIndexQuery(rpsDbInfo.lookup, posMatrix,
1107                              NULL, lookup_segment, rpsDbInfo.curr_seq_offset);
1108 
1109      BlastSeqLocFree(lookup_segment);
1110      return;
1111  }
1112 
1113  /* Incrementally update the RPS PSSM file with the
1114     PSSM for the next input sequence
1115          info contains all the information on data files
1116                  and parameters from previously added sequences
1117          seq is the sequence and PSSM read in from the next data file
1118          seq_index refers to the (0-based) position of this sequence
1119                  in the complete list of seqences
1120          seq_size is the number of letters in this sequence
1121          alphabet_size refers to the number of PSSM rows
1122  */
x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo,const CPssm & pssm,Int4 seq_index,Int4 seq_size)1123 void CMakeProfileDBApp::x_RPSUpdatePSSM(CRPS_DbInfo & rpsDbInfo, const CPssm & pssm, Int4 seq_index, Int4 seq_size)
1124 {
1125      Int4 i = 0;
1126      Int4 j = 0;
1127 
1128      /* Note that RPS blast requires an extra column at
1129       * the end of the PSSM */
1130 
1131      list<int>::const_iterator  score_list_itr = pssm.GetFinalData().GetScores().begin();
1132      list<int>::const_iterator  score_list_end = pssm.GetFinalData().GetScores().end();
1133      Int4 alphabet_size = pssm.GetNumRows();
1134 
1135      rpsDbInfo.pos_matrix.Create(seq_size + 1);
1136      Int4 ** posMatrix = rpsDbInfo.pos_matrix.Get();
1137      if (pssm.GetByRow() == FALSE) {
1138          for (i = 0; i < seq_size; i++) {
1139              for (j = 0; j < alphabet_size; j++) {
1140                  if (score_list_itr == score_list_end)
1141                      break;
1142                  posMatrix[i][j] = *score_list_itr;
1143                  score_list_itr++;
1144              }
1145              if (j < alphabet_size)
1146                  break;
1147              for (; j < BLASTAA_SIZE; j++) {
1148                  posMatrix[i][j] = INT2_MIN;
1149              }
1150          }
1151      }
1152      else {
1153          for (j = 0; j < alphabet_size; j++) {
1154              for (i = 0; i < seq_size; i++) {
1155                  if (score_list_itr == score_list_end)
1156                      break;
1157                  posMatrix[i][j] = *score_list_itr;
1158                  score_list_itr++;
1159              }
1160              if (i < seq_size)
1161                  break;
1162          }
1163          if (j == alphabet_size) {
1164              for (; j < BLASTAA_SIZE; j++) {
1165                  for (i = 0; i < seq_size; i++) {
1166                      posMatrix[i][j] = INT2_MIN;
1167                  }
1168              }
1169          }
1170      }
1171 
1172      if (i < seq_size || j < alphabet_size)
1173     	 NCBI_THROW(CBlastException, eCoreBlastError, "PSSM was truncated early");
1174 
1175      if(score_list_itr != score_list_end)
1176     	 NCBI_THROW(CBlastException, eCoreBlastError, "PSSM too large for this sequence");
1177 
1178      /* manually fill in the extra (last) column of the PSSM.
1179         Note that the value to use should more appropriately
1180         be BLAST_SCORE_MIN, but we instead follow the convention
1181         used in copymat */
1182 
1183      for (i = 0; i < BLASTAA_SIZE; i++)
1184         posMatrix[seq_size][i] = -BLAST_SCORE_MAX;
1185 
1186      /* Dump the score matrix, column by column */
1187      rpsDbInfo.pssm_file.seekp(0, ios_base::end);
1188      for (i = 0; i < seq_size + 1; i++) {
1189     	rpsDbInfo.pssm_file.write((const char *) posMatrix[i], sizeof(Int4)*BLASTAA_SIZE);
1190      }
1191      /* Write the next context offset. Note that the
1192         RPSProfileHeader structure is one int too large for
1193         our purposes, so that the index of this sequence
1194         must be decremented to get the right byte offset
1195         into the file */
1196 
1197      rpsDbInfo.pssm_file.seekp( 8 + (seq_index) * sizeof(Int4), ios_base::beg);
1198      rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1199 
1200      return;
1201  }
1202 
1203 /* Once all sequences have been processed, perform
1204    final setup on the BLAST lookup table and finish
1205    up the RPS files */
1206 
x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo)1207 void CMakeProfileDBApp::x_RPS_DbClose(CRPS_DbInfo & rpsDbInfo)
1208 {
1209     /* Write the last context offset to the PSSM file.
1210        This is the total number of letters for all RPS
1211        DB sequences combined */
1212 
1213     rpsDbInfo.pssm_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1214     rpsDbInfo.pssm_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1215     rpsDbInfo.freq_file.seekp(8 + (rpsDbInfo.num_seqs) * sizeof(Int4), ios::beg);
1216     rpsDbInfo.freq_file.write((const char *) &rpsDbInfo.curr_seq_offset, sizeof(Int4));
1217 
1218     /* Pack the lookup table into its compressed form */
1219     if(NULL == rpsDbInfo.lookup)
1220         NCBI_THROW(CBlastException, eCoreBlastError, "Empty database");
1221 
1222     if (BlastAaLookupFinalize(rpsDbInfo.lookup, eBackbone) != 0) {
1223         NCBI_THROW(CBlastException, eCoreBlastError, "Failed to compress lookup table");
1224     }
1225     else {
1226         /* Change the lookup table format to match that
1227            of the legacy BLAST lookup table */
1228 
1229         BlastRPSLookupFileHeader header;
1230         BlastAaLookupTable *lut = rpsDbInfo.lookup;
1231         Int4 i, index;
1232         Int4 cursor, old_cursor;
1233         AaLookupBackboneCell *cell;
1234         RPSBackboneCell empty_cell;
1235 
1236         memset(&header, 0, sizeof(header));
1237         header.magic_number = RPS_DATABASE_VERSION;
1238 
1239         /* for each lookup table cell */
1240 
1241         for (index = cursor = 0; index < lut->backbone_size; index++) {
1242             cell = (AaLookupBackboneCell*)lut->thick_backbone + index;
1243 
1244 
1245             if (cell->num_used == 0)
1246                 continue;
1247 
1248             /* The cell contains hits */
1249 
1250             if (cell->num_used <= RPS_HITS_PER_CELL) {
1251                 /* if 3 hits or less, just update each hit offset
1252                    to point to the end of the word rather than
1253                    the beginning */
1254 
1255                 for (i = 0; i < cell->num_used; i++)
1256                     cell->payload.entries[i] += BLAST_WORDSIZE_PROT - 1;
1257             }
1258             else {
1259                 /* if more than 3 hits, pack the first hit into the
1260                    lookup table cell, pack the overflow array byte
1261                    offset into the cell, and compress the resulting
1262                    'hole' in the overflow array. Update the hit
1263                    offsets as well */
1264 
1265                 old_cursor = cell->payload.overflow_cursor;
1266                 cell->payload.entries[0] = ((Int4*)lut->overflow)[old_cursor] +
1267                                         BLAST_WORDSIZE_PROT - 1;
1268                 cell->payload.entries[1] = cursor * sizeof(Int4);
1269                 for (i = 1; i < cell->num_used; i++, cursor++) {
1270                     ((Int4*)lut->overflow)[cursor]
1271                             = ((Int4*)lut->overflow)[old_cursor + i] +
1272                                         BLAST_WORDSIZE_PROT - 1;
1273                 }
1274             }
1275         }
1276 
1277         header.start_of_backbone = sizeof(header);
1278         header.end_of_overflow = header.start_of_backbone +
1279                    (RPS_NUM_LOOKUP_CELLS + 1) * sizeof(RPSBackboneCell) +
1280                    cursor * sizeof(Int4);
1281 
1282         /* write the lookup file header */
1283 
1284         rpsDbInfo.lookup_file.write((const char *)&header, sizeof(header));
1285 
1286         /* write the thick backbone */
1287 
1288         rpsDbInfo.lookup_file.write((const char *)lut->thick_backbone,
1289         							  sizeof(RPSBackboneCell)* lut->backbone_size);
1290 
1291         /* write extra backbone cells */
1292         memset(&empty_cell, 0, sizeof(empty_cell));
1293         for (i = lut->backbone_size; i < RPS_NUM_LOOKUP_CELLS + 1; i++) {
1294             rpsDbInfo.lookup_file.write((const char *)&empty_cell, sizeof(empty_cell));
1295         }
1296 
1297         /* write the new overflow array */
1298         rpsDbInfo.lookup_file.write((const char *)lut->overflow, sizeof(Int4)*cursor);
1299     }
1300 
1301     /* Free data, close files */
1302 
1303     rpsDbInfo.lookup = BlastAaLookupTableDestruct(rpsDbInfo.lookup);
1304     rpsDbInfo.query_options = BlastQuerySetUpOptionsFree(rpsDbInfo.query_options);
1305     rpsDbInfo.lookup_file.flush();
1306     rpsDbInfo.lookup_file.close();
1307     rpsDbInfo.pssm_file.flush();
1308     rpsDbInfo.pssm_file.close();
1309     rpsDbInfo.aux_file.flush();
1310     rpsDbInfo.aux_file.close();
1311 	rpsDbInfo.freq_file.flush();
1312 	rpsDbInfo.freq_file.close();
1313 
1314     if(op_cobalt == m_op_mode)
1315     {
1316     	rpsDbInfo.blocks_file.flush();
1317     	rpsDbInfo.blocks_file.close();
1318     }
1319     else if(!m_UpdateFreqRatios)
1320     {
1321     	string freq_str = m_OutDbName + ".freq";
1322 	 	CFile(freq_str).Remove();
1323     }
1324 
1325 }
1326 
Init(void)1327 void CMakeProfileDBApp::Init(void)
1328 {
1329 	x_SetupArgDescriptions();
1330 	x_InitProgramParameters();
1331 }
1332 
s_HasDefline(const CBioseq & bio)1333 static bool s_HasDefline(const CBioseq & bio)
1334 {
1335     if (bio.CanGetDescr()) {
1336         return true;
1337     }
1338 
1339     return false;
1340 }
1341 
s_GenerateBlastDefline(const CBioseq & bio)1342 static CRef<CBlast_def_line_set> s_GenerateBlastDefline(const CBioseq & bio)
1343 {
1344 	 CRef<CBlast_def_line_set> defline_set(new CBlast_def_line_set());
1345 	 CRef<CBlast_def_line> defline(new CBlast_def_line());
1346 	 defline->SetSeqid() = bio.GetId();
1347 	 defline_set->Set().push_back(defline);
1348 	 return defline_set;
1349 }
1350 
x_Run(void)1351 int CMakeProfileDBApp::x_Run(void)
1352 {
1353 	 CBuildDatabase::CreateDirectories(m_OutDbName);
1354 	 if ( s_DeleteMakeprofileDb(m_OutDbName)) {
1355 		 *m_LogFile << "Deleted existing BLAST database with identical name." << endl;
1356 	 }
1357 	vector<string> smpFilenames = (op_delta == m_op_mode )? x_CreateDeltaList():x_GetSMPFilenames();
1358 	int num_smps = smpFilenames.size();
1359 	m_NumOfVols = num_smps/m_MaxSmpFilesPerVol + 1;
1360 	int num_seqs = num_smps/m_NumOfVols;
1361 	int residue_seqs = num_smps % m_NumOfVols;
1362 	if(m_NumOfVols == 1) {
1363 		x_MakeVol( -1, smpFilenames);
1364 		m_Done = true;
1365 		return 0;
1366 	}
1367 	else {
1368 		vector<string>::iterator b = smpFilenames.begin();
1369 		vector<string>::iterator r = b + num_seqs;
1370 		for(int i=0; i < m_NumOfVols; i++) {
1371 			vector<string> vol_smps(b, r);
1372 			x_MakeVol(i, vol_smps);
1373 			b= r;
1374 			r = b + num_seqs;
1375 			if(residue_seqs > 0) {
1376 				r++;
1377 				residue_seqs--;
1378 			}
1379 		}
1380 		_ASSERT(b==smpFilenames.end());
1381 	}
1382 	if (m_NumOfVols == m_VolNames.size()) {
1383 		x_CreateAliasFile();
1384 		m_Done = true;
1385 	}
1386 	return 0;
1387 }
1388 
x_MakeVol(Int4 vol,vector<string> & smps)1389 void CMakeProfileDBApp::x_MakeVol(Int4 vol, vector<string> & smps)
1390 {
1391 
1392 	CRPS_DbInfo rpsDbInfo;
1393 	x_InitRPSDbInfo(rpsDbInfo, vol, smps.size());
1394 	m_VolNames.push_back(rpsDbInfo.db_name);
1395 	x_InitOutputDb(rpsDbInfo);
1396 
1397 	for(int seq_index=0; seq_index < rpsDbInfo.num_seqs; seq_index++)
1398 	{
1399 		string filename = smps[seq_index];
1400 		CFile f(filename);
1401 		if(!f.Exists())
1402 		{
1403 			string err = filename + " does not exists";
1404 			NCBI_THROW(CInputException, eInvalidInput,  err);
1405 		}
1406 
1407 		//Read PssmWithParameters from file
1408 		CPssmWithParameters pssm_w_parameters;
1409 		if(m_binary_scoremat)
1410 		{
1411 			CNcbiIfstream	in_stream(filename.c_str(), ios::binary);
1412 			in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1413 		}
1414 		else
1415 		{
1416 			CNcbiIfstream	in_stream(filename.c_str());
1417 			in_stream >> MSerial_AsnText >> pssm_w_parameters;
1418 		}
1419 
1420 		CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1421 		// Should have error out already....
1422 		if(sm_invalid == sm)
1423 		{
1424 			string err = filename + " contains invalid scoremat";
1425 			NCBI_THROW(CInputException, eInvalidInput,  err);
1426 		}
1427 
1428 		const CPssm & pssm = pssm_w_parameters.GetPssm();
1429 		int seq_size = pssm.GetQueryLength();
1430 
1431 		const CBioseq & bioseq = pssm.GetQuery().GetSeq();
1432 		CRef<CBlast_def_line_set> deflines;
1433 		if(s_HasDefline(bioseq)) {
1434 			deflines = CWriteDB::ExtractBioseqDeflines(bioseq);
1435 		}
1436 		else {
1437 			deflines = s_GenerateBlastDefline(bioseq);
1438 		}
1439 
1440 		m_Taxids->FixTaxId(deflines);
1441 		rpsDbInfo.output_db->AddSequence(bioseq);
1442 		rpsDbInfo.output_db->SetDeflines(*deflines);
1443 
1444 		//Complete RpsDnInfo init with data from first file
1445 		if(NULL == rpsDbInfo.lookup)
1446 		{
1447 			x_RPSAddFirstSequence( rpsDbInfo, pssm_w_parameters, sm == sm_valid_freq_only);
1448 		}
1449 		else
1450 		{
1451 			 x_FillInRPSDbParameters(rpsDbInfo, pssm_w_parameters);
1452 			if(sm_valid_freq_only == sm){
1453 				x_RPSUpdateStatistics(rpsDbInfo, pssm_w_parameters, seq_size);
1454 			}
1455 
1456 			if( pssm.GetFinalData().IsSetScalingFactor())
1457 			{
1458 				if( pssm.GetFinalData().GetScalingFactor() != rpsDbInfo.scale_factor) {
1459 					NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1460 				}
1461 			}
1462             else
1463             {
1464                 // If scaling factor not specified, the default is 1
1465                 if( 1 != rpsDbInfo.scale_factor) {
1466 					NCBI_THROW(CBlastException, eCoreBlastError, "Scaling factors do not match");
1467                 }
1468             }
1469 
1470 			if(m_UseModelThreshold && pssm.GetFinalData().IsSetWordScoreThreshold()) {
1471 			 	rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * pssm_w_parameters.GetPssm().GetFinalData().GetWordScoreThreshold();
1472 			}
1473 			else {
1474 				rpsDbInfo.lookup->threshold = rpsDbInfo.scale_factor * m_WordDefaultScoreThreshold;
1475 			}
1476 
1477 		}
1478 
1479 		x_RPSUpdatePSSM(rpsDbInfo, pssm, seq_index, seq_size);
1480 		x_RPSUpdateLookup(rpsDbInfo, seq_size);
1481 		x_UpdateFreqRatios(rpsDbInfo, pssm_w_parameters, seq_index, seq_size);
1482 
1483 		rpsDbInfo.aux_file << seq_size << "\n";
1484 		rpsDbInfo.aux_file << scientific << pssm.GetFinalData().GetKappa() << "\n";
1485 		rpsDbInfo.curr_seq_offset +=(seq_size +1);
1486 		rpsDbInfo.pos_matrix.Delete();
1487 
1488 		if(op_cobalt == m_op_mode) {
1489 			x_UpdateCobalt(rpsDbInfo, pssm_w_parameters, seq_size);
1490 		}
1491 	}
1492 
1493 	if(op_delta == m_op_mode) {
1494 		x_UpdateDelta(rpsDbInfo, smps);
1495 	}
1496 	rpsDbInfo.output_db->Close();
1497 	x_RPS_DbClose(rpsDbInfo);
1498 }
1499 
s_WriteInt4List(CNcbiOfstream & ostr,const list<Int4> & l)1500 static void s_WriteInt4List(CNcbiOfstream & ostr, const list<Int4> & l)
1501 {
1502 	ITERATE(list<Int4>, it, l)
1503 	{
1504 		ostr.write((char*)&(*it), sizeof(Int4));
1505 	}
1506 }
1507 
s_WriteUint4List(CNcbiOfstream & ostr,const list<Uint4> & l)1508 static void s_WriteUint4List(CNcbiOfstream & ostr, const list<Uint4> & l)
1509 {
1510 	ITERATE(list<Uint4>, it, l)
1511 	{
1512 		ostr.write((char*)&(*it), sizeof(Uint4));
1513 	}
1514 }
1515 
x_CreateDeltaList(void)1516 vector<string> CMakeProfileDBApp::x_CreateDeltaList(void)
1517 {
1518 	vector<string> smpFilenames = x_GetSMPFilenames();
1519 	vector<string> deltaList;
1520 
1521 	for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1522 	{
1523 		string filename = smpFilenames[seq_index];
1524 		CFile f(filename);
1525 		if(!f.Exists())
1526 		{
1527 			string err = filename + " does not exists";
1528 			NCBI_THROW(CInputException, eInvalidInput,  err);
1529 		}
1530 
1531 		//Read PssmWithParameters from file
1532 		CPssmWithParameters pssm_w_parameters;
1533 		if(m_binary_scoremat)
1534 		{
1535 			CNcbiIfstream	in_stream(filename.c_str(), ios::binary);
1536 			in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1537 		}
1538 		else
1539 		{
1540 			CNcbiIfstream	in_stream(filename.c_str());
1541 			in_stream >> MSerial_AsnText >> pssm_w_parameters;
1542 		}
1543 
1544 		CheckInputScoremat_RV sm = x_CheckInputScoremat(pssm_w_parameters, filename);
1545 		// Should have error out already....
1546 		if(sm_invalid == sm)
1547 		{
1548 			string err = filename + " contains invalid scoremat";
1549 			NCBI_THROW(CInputException, eInvalidInput,  err);
1550 		}
1551 
1552 		const CPssm & pssm = pssm_w_parameters.GetPssm();
1553 		int seq_size = pssm.GetQueryLength();
1554 		if(!pssm.IsSetIntermediateData()|| !pssm.GetIntermediateData().IsSetWeightedResFreqsPerPos())
1555 		{
1556 			string err = filename + " contains no weighted residue frequencies for building delta database";
1557 			NCBI_THROW(CInputException, eInvalidInput,  err);
1558 		}
1559 
1560 		if(!pssm.GetIntermediateData().IsSetNumIndeptObsr())
1561 		{
1562 			string err = filename + " contains no observations information for building delta database";
1563 			NCBI_THROW(CInputException, eInvalidInput,  err);
1564 		}
1565 
1566 		if (true == x_CheckDelta(pssm, seq_size, filename))
1567 		{
1568 			deltaList.push_back(filename);
1569 		}
1570 	}
1571 
1572 	return deltaList;
1573 }
1574 
x_UpdateDelta(CRPS_DbInfo & rpsDbInfo,vector<string> & smpFilenames)1575 void CMakeProfileDBApp::x_UpdateDelta(CRPS_DbInfo & rpsDbInfo, vector<string> & smpFilenames)
1576 {
1577 	CTmpFile tmp_obsr_file(CTmpFile::eRemove);
1578 	CTmpFile tmp_freq_file(CTmpFile::eRemove);
1579 	CNcbiOfstream  tmp_obsr_buff(tmp_obsr_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1580 	CNcbiOfstream  tmp_freq_buff(tmp_freq_file.GetFileName().c_str(), IOS_BASE::out | IOS_BASE::binary);
1581 
1582     list<Int4> FreqOffsets;
1583     list<Int4> ObsrOffsets;
1584     Int4 CurrFreqOffset = 0;
1585     Int4 CurrObsrOffset= 0;
1586 
1587 	for(unsigned int seq_index=0; seq_index < smpFilenames.size(); seq_index++)
1588 	{
1589 		string filename = smpFilenames[seq_index];
1590 		//Read PssmWithParameters from file
1591 		CPssmWithParameters pssm_w_parameters;
1592 		if(m_binary_scoremat)
1593 		{
1594 			CNcbiIfstream	in_stream(filename.c_str(), ios::binary);
1595 			in_stream >> MSerial_AsnBinary >> pssm_w_parameters;
1596 		}
1597 		else
1598 		{
1599 			CNcbiIfstream	in_stream(filename.c_str());
1600 			in_stream >> MSerial_AsnText >> pssm_w_parameters;
1601 		}
1602 
1603 		const CPssm & pssm = pssm_w_parameters.GetPssm();
1604 		int seq_size = pssm.GetQueryLength();
1605 
1606 	    // get weightd residue frequencies
1607 	   const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1608 
1609 	    // get number of independent observations
1610 	    const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1611 
1612 	    int alphabet_size = pssm.GetNumRows();
1613 	    list<double> modify_freqs;
1614 
1615 	    if(pssm.GetByRow())
1616 	    {
1617 	    	// need to flip the freq matrix
1618 	    	vector<double> tmp(orig_freqs.size());
1619 	    	list<double>::const_iterator f_itr = orig_freqs.begin();
1620 
1621 	    	for(int i = 0; i < alphabet_size; i++)
1622 	    	{
1623 	    		for(int j = 0; j < seq_size; j++)
1624 	    		{
1625 	    			tmp[i + j*alphabet_size] = *f_itr;
1626 	    			++f_itr;
1627 	    		}
1628 	    	}
1629 	    	copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1630 	    }
1631 
1632 	    // Pad matrix if necessary
1633 	  	if(alphabet_size < BLASTAA_SIZE)
1634 	  	{
1635 	  		if(0 == modify_freqs.size())
1636 	  			copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1637 
1638 	  		list<double>::iterator p_itr = modify_freqs.begin();
1639 
1640 	  		for (int j=0; j < seq_size; j++)
1641 	  		{
1642 	  			for(int i=0; i < alphabet_size; i++)
1643 	  			{
1644 	  				if(modify_freqs.end() == p_itr)
1645 	  					break;
1646 
1647 	  				++p_itr;
1648 	  			}
1649 
1650 	  			modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1651 	  		}
1652 	    }
1653 
1654 	  	const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1655 
1656 	    //save offset for this record
1657 	    ObsrOffsets.push_back(CurrObsrOffset);
1658 
1659 	    list<Uint4> ObsrBuff;
1660 	    // write effective observations in compressed form
1661 	    // as a list of pairs: value, number of occurences
1662 	    unsigned int num_obsr_columns = 0;
1663 	    list<double>::const_iterator obsr_it = obsr.begin();
1664 	    do
1665 	    {
1666 	    	double current = *obsr_it;
1667 	        Uint4 num = 1;
1668 	        num_obsr_columns++;
1669 	        obsr_it++;
1670 	        while (obsr_it != obsr.end() && fabs(*obsr_it - current) < 1e-4)
1671 	        {
1672 	        	obsr_it++;
1673 	            num++;
1674 	            num_obsr_columns++;
1675 	        }
1676 
1677 	        // +1 because pssm engine returns alpha (in psi-blast papers)
1678 	        // which is number of independent observations - 1
1679 	        ObsrBuff.push_back((Uint4)((current + 1.0) * kFixedPointScaleFactor));
1680 	        ObsrBuff.push_back(num);
1681 	    }
1682 	  	while (obsr_it != obsr.end());
1683 
1684 	    Uint4 num_weighted_counts = 0;
1685 
1686 	    // save offset for this frequencies record
1687 	    FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1688 
1689 	    list<Uint4> FreqBuff;
1690 	    // save weighted residue frequencies
1691 	    ITERATE (list<double>, it, freqs)
1692 	    {
1693 	      	FreqBuff.push_back((Uint4)(*it * kFixedPointScaleFactor));
1694 	      	num_weighted_counts++;
1695 	    }
1696 
1697 	    if (num_obsr_columns != num_weighted_counts / BLASTAA_SIZE)
1698 	    {
1699 	    	string err = "Number of frequencies and observations columns do not match in " + filename;
1700 	    	NCBI_THROW(CException, eInvalid, err);
1701 	    }
1702 
1703 	    // additional column of zeros is added for compatibility with rps database
1704 	    unsigned int padded_size = FreqBuff.size() + BLASTAA_SIZE;
1705 	    FreqBuff.resize(padded_size, 0);
1706 
1707 	    CurrFreqOffset += FreqBuff.size();
1708 	    CurrObsrOffset += ObsrBuff.size();
1709 	    s_WriteUint4List(tmp_freq_buff, FreqBuff);
1710 	    s_WriteUint4List(tmp_obsr_buff, ObsrBuff);
1711 
1712 	}
1713 
1714 	tmp_obsr_buff.flush();
1715 	tmp_freq_buff.flush();
1716 	x_WrapUpDelta(rpsDbInfo, tmp_obsr_file, tmp_freq_file, FreqOffsets, ObsrOffsets, CurrFreqOffset, CurrObsrOffset);
1717 }
1718 
1719 
x_ValidateCd(const list<double> & freqs,const list<double> & observ,unsigned int alphabet_size)1720 bool CMakeProfileDBApp::x_ValidateCd(const list<double>& freqs,
1721                                      const list<double>& observ,
1722                                      unsigned int alphabet_size)
1723 {
1724 
1725     if (freqs.size() / alphabet_size != observ.size())
1726     {
1727     	string err = "Number of frequency and observations columns do not match";
1728         NCBI_THROW(CException, eInvalid, err);
1729     }
1730 
1731     ITERATE (list<double>, it, freqs)
1732     {
1733         unsigned int residue = 0;
1734         double sum = 0.0;
1735         while (residue < alphabet_size - 1)
1736         {
1737             sum += *it;
1738             it++;
1739             residue++;
1740         }
1741         sum += *it;
1742 
1743         if (fabs(sum - 1.0) > kEpsylon)
1744             return false;
1745     }
1746 
1747     ITERATE (list<double>, it, observ)
1748     {
1749            if (*it < 1.0)
1750                return false;
1751    }
1752 
1753     return true;
1754 }
1755 
1756 
x_CheckDelta(const CPssm & pssm,Int4 seq_size,const string & filename)1757 bool CMakeProfileDBApp::x_CheckDelta( const CPssm  & pssm, Int4 seq_size, const string & filename)
1758 {
1759     // get weightd residue frequencies
1760    const list<double>& orig_freqs = pssm.GetIntermediateData().GetWeightedResFreqsPerPos();
1761 
1762     // get number of independent observations
1763     const list<double>& obsr = pssm.GetIntermediateData().GetNumIndeptObsr();
1764 
1765     int alphabet_size = pssm.GetNumRows();
1766     list<double> modify_freqs;
1767 
1768     if(pssm.GetByRow())
1769     {
1770     	// need to flip the freq matrix
1771     	vector<double> tmp(orig_freqs.size());
1772     	list<double>::const_iterator f_itr = orig_freqs.begin();
1773 
1774     	for(int i = 0; i < alphabet_size; i++)
1775     	{
1776     		for(int j = 0; j < seq_size; j++)
1777     		{
1778     			tmp[i + j*alphabet_size] = *f_itr;
1779     			++f_itr;
1780     		}
1781     	}
1782     	copy(tmp.begin(), tmp.end(), modify_freqs.begin());
1783     }
1784 
1785     // Pad matrix if necessary
1786   	if(alphabet_size < BLASTAA_SIZE)
1787   	{
1788   		if(0 == modify_freqs.size())
1789   			copy(orig_freqs.begin(), orig_freqs.end(), modify_freqs.begin());
1790 
1791   		list<double>::iterator p_itr = modify_freqs.begin();
1792 
1793   		for (int j=0; j < seq_size; j++)
1794   		{
1795   			for(int i=0; i < alphabet_size; i++)
1796   			{
1797   				if(modify_freqs.end() == p_itr)
1798   					break;
1799 
1800   				++p_itr;
1801   			}
1802 
1803   			modify_freqs.insert(p_itr, (BLASTAA_SIZE-alphabet_size), 0);
1804   		}
1805     }
1806 
1807   	const list<double> & freqs = (modify_freqs.size()? modify_freqs:orig_freqs );
1808     double max_obsr = *max_element(obsr.begin(), obsr.end()) + 1.0;
1809     if(max_obsr < m_ObsrvThreshold)
1810     {
1811     	*m_LogFile << filename +
1812     			" was excluded: due to too few independent observations\n";
1813     	return false;
1814     }
1815 
1816     if( !x_ValidateCd(freqs, obsr, BLASTAA_SIZE) && m_ExcludeInvalid)
1817     {
1818     	*m_LogFile << filename +
1819     			" was excluded: it conatins an invalid CD \n";
1820     	return false;
1821     }
1822     return true;
1823 }
1824 
1825 
1826 
x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo,CTmpFile & tmp_obsr_file,CTmpFile & tmp_freq_file,list<Int4> & FreqOffsets,list<Int4> & ObsrOffsets,Int4 CurrFreqOffset,Int4 CurrObsrOffset)1827 void CMakeProfileDBApp::x_WrapUpDelta(CRPS_DbInfo & rpsDbInfo, CTmpFile & tmp_obsr_file, CTmpFile & tmp_freq_file,
1828 									  list<Int4> & FreqOffsets, list<Int4> & ObsrOffsets, Int4 CurrFreqOffset, Int4 CurrObsrOffset)
1829 {
1830     FreqOffsets.push_back(CurrFreqOffset / BLASTAA_SIZE);
1831     ObsrOffsets.push_back(CurrObsrOffset);
1832 
1833     string wcounts_str = rpsDbInfo.db_name + ".wcounts";
1834     CNcbiOfstream wcounts_file(wcounts_str.c_str(), ios::out | ios::binary);
1835     if (!wcounts_file.is_open())
1836     	 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .wcounts file");
1837 
1838     string obsr_str = rpsDbInfo.db_name + ".obsr";
1839     CNcbiOfstream 	 obsr_file(obsr_str.c_str(), IOS_BASE::out|IOS_BASE::binary);
1840     if (!obsr_file.is_open())
1841     	 NCBI_THROW(CSeqDBException, eFileErr,"Failed to open output .obsr file");
1842 
1843  	CNcbiIfstream tmp_obsr_buff (tmp_obsr_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1844  	CNcbiIfstream tmp_freq_buff (tmp_freq_file.GetFileName().c_str(), IOS_BASE::in | IOS_BASE::binary);
1845 
1846     // write RPS BLAST database magic number
1847     Int4 magic_number = RPS_MAGIC_NUM_28;
1848     wcounts_file.write((char*)&magic_number, sizeof(Int4));
1849     obsr_file.write((char*)&magic_number, sizeof(Int4));
1850 
1851     // write number of recrods
1852     Int4 num_wcounts_records = FreqOffsets.size() -1;
1853     Int4 num_obsr_records = ObsrOffsets.size() -1;
1854     wcounts_file.write((char*)&num_wcounts_records, sizeof(Int4));
1855     obsr_file.write((char*)&num_obsr_records, sizeof(Int4));
1856 
1857     s_WriteInt4List(wcounts_file, FreqOffsets);
1858     wcounts_file.flush();
1859     wcounts_file << tmp_freq_buff.rdbuf();
1860     wcounts_file.flush();
1861     wcounts_file.close();
1862 
1863     s_WriteInt4List(obsr_file, ObsrOffsets);
1864     obsr_file.flush();
1865     obsr_file << tmp_obsr_buff.rdbuf();
1866     obsr_file.flush();
1867     obsr_file.close();
1868 }
1869 
x_CreateAliasFile(void)1870 void CMakeProfileDBApp::x_CreateAliasFile(void)
1871 {
1872 	vector<string> v;
1873 	for(unsigned int i=0; i < m_VolNames.size(); i++) {
1874 		string t = kEmptyStr;
1875 		CSeqDB_Substring s = SeqDB_RemoveDirName(CSeqDB_Substring( m_VolNames[i]));
1876 		s.GetString(t);
1877 		v.push_back(t);
1878 	}
1879 	CWriteDB_CreateAliasFile(m_OutDbName , v,
1880 	                                 CWriteDB::eProtein, kEmptyStr, m_Title, eNoAliasFilterType);
1881 }
1882 
Run(void)1883 int CMakeProfileDBApp::Run(void)
1884 {
1885 	int status = 0;
1886 	try { x_Run(); }
1887 	catch(const blast::CInputException& e)  {
1888 		 ERR_POST(Error << "INPUT ERROR: " << e.GetMsg());
1889 		 status = BLAST_INPUT_ERROR;
1890 	}
1891 	catch (const CSeqDBException& e) {
1892          ERR_POST(Error << "ERROR: " << e.GetMsg());
1893 	     status = BLAST_DATABASE_ERROR;
1894 	}
1895 	catch (const blast::CBlastException& e) {
1896 		 ERR_POST(Error << "ERROR: " << e.GetMsg());
1897 		 status = BLAST_INPUT_ERROR;
1898 	}
1899 	catch (const CException& e) {
1900         ERR_POST(Error << "ERROR: " << e.GetMsg());
1901 	    status = BLAST_UNKNOWN_ERROR;
1902 	}
1903 	catch (...) {
1904         ERR_POST(Error << "Error: Unknown exception");
1905 	    status = BLAST_UNKNOWN_ERROR;
1906 	}
1907 
1908 	x_AddCmdOptions();
1909     m_UsageReport.AddParam(CBlastUsageReport::eExitStatus, status);
1910 	return status;
1911 }
1912 
x_AddCmdOptions(void)1913 void CMakeProfileDBApp::x_AddCmdOptions(void)
1914 {
1915 	const CArgs & args = GetArgs();
1916     if (args["dbtype"].HasValue()) {
1917     	 m_UsageReport.AddParam(CBlastUsageReport::eDBType, args["dbtype"].AsString());
1918     }
1919     if(args["taxid"].HasValue() || args["taxid_map"].HasValue()) {
1920     	 m_UsageReport.AddParam(CBlastUsageReport::eTaxIdList, true);
1921 	}
1922 }
1923 
1924 
1925 #ifndef SKIP_DOXYGEN_PROCESSING
main(int argc,const char * argv[])1926 int main(int argc, const char* argv[] /*, const char* envp[]*/)
1927 {
1928 		return CMakeProfileDBApp().AppMain(argc, argv);
1929 }
1930 
1931 
1932 
1933 
1934 #endif /* SKIP_DOXYGEN_PROCESSING */
1935 
1936