1 /* ===========================================================================
2 *
3 * PUBLIC DOMAIN NOTICE
4 * National Center for Biotechnology Information
5 *
6 * This software/database is a "United States Government Work" under the
7 * terms of the United States Copyright Act. It was written as part of
8 * the author's official duties as a United States Government employee and
9 * thus cannot be copyrighted. This software/database is freely available
10 * to the public for use. The National Library of Medicine and the U.S.
11 * Government have not placed any restriction on its use or reproduction.
12 *
13 * Although all reasonable efforts have been taken to ensure the accuracy
14 * and reliability of the software and data, the NLM and the U.S.
15 * Government do not and cannot warrant the performance or results that
16 * may be obtained by using this software or data. The NLM and the U.S.
17 * Government disclaim all warranties, express or implied, including
18 * warranties of performance, merchantability or fitness for any particular
19 * purpose.
20 *
21 * Please cite the author in any work or product based on this material.
22 *
23 * ===========================================================================
24 *
25 * Author: Christiam Camacho
26 *
27 */
28
29 /** @file rps_aux.cpp
30 * Implements auxiliary classes to manage RPS-BLAST related C-structures
31 */
32
33
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <algo/blast/api/blast_exception.hpp>
37 #include <objtools/blast/seqdb_reader/seqdb.hpp>
38 #include <algo/blast/api/rps_aux.hpp>
39 #include <algo/blast/core/blast_def.h>
40 #include <memory>
41
42 /** @addtogroup AlgoBlast
43 *
44 * @{
45 */
46
47 BEGIN_NCBI_SCOPE
48 BEGIN_SCOPE(blast)
49
50 /////////////////////////////////////////////////////////////////////////////
51 //
52 // CBlastRPSAuxInfo
53 //
54 /////////////////////////////////////////////////////////////////////////////
55
56 /// Wrapper class to manage the BlastRPSAuxInfo structure, as currently
57 /// there aren't any allocation or deallocation functions for this structure in
58 /// the CORE of BLAST. This class is meant to be kept in a CRef<>. Note that
59 /// only the fields currently used are being passed to the constructor
60 class CBlastRPSAuxInfo : public CObject {
61 public:
62 /// Parametrized constructor
63 /// @param matrix name of the scoring matrix used to build the RPS-BLAST
64 /// database
65 /// @param gap_open gap opening cost
66 /// @param gap_extend gap extension cost
67 /// @param scale_factor scaling factor
68 /// @param karlin_k statistical K parameter calculated when building the
69 /// RPS-BLAST database
70 CBlastRPSAuxInfo(const string& matrix,
71 int gap_open,
72 int gap_extend,
73 double scale_factor,
74 const vector<double>& karlin_k);
75
76 /// Destructor
77 ~CBlastRPSAuxInfo();
78
79 /// Lend the caller the pointer to the data structure this object manages.
80 /// Caller MUST NOT deallocate the return value.
81 const BlastRPSAuxInfo* operator()() const;
82 private:
83 /// Prohibit copy-constructor
84 CBlastRPSAuxInfo(const CBlastRPSAuxInfo& rhs);
85 /// Prohibit assignment operator
86 CBlastRPSAuxInfo& operator=(const CBlastRPSAuxInfo& rhs);
87
88 /// Deallocates the structure owned by this class
89 void x_DoDestroy();
90
91 /// The data structure this class manages
92 BlastRPSAuxInfo* m_Data;
93 };
94
CBlastRPSAuxInfo(const string & matrix,int gap_open,int gap_extend,double scale_factor,const vector<double> & karlin_k)95 CBlastRPSAuxInfo::CBlastRPSAuxInfo(const string& matrix,
96 int gap_open,
97 int gap_extend,
98 double scale_factor,
99 const vector<double>& karlin_k)
100 : m_Data(0)
101 {
102 _ASSERT(!matrix.empty());
103 _ASSERT(!karlin_k.empty());
104
105 try {
106 m_Data = new BlastRPSAuxInfo;
107 memset(m_Data, 0, sizeof(BlastRPSAuxInfo));
108 m_Data->orig_score_matrix = strdup(matrix.c_str());
109 m_Data->gap_open_penalty = gap_open;
110 m_Data->gap_extend_penalty = gap_extend;
111 m_Data->scale_factor = scale_factor;
112 m_Data->karlin_k = new double[karlin_k.size()];
113 copy(karlin_k.begin(), karlin_k.end(), &m_Data->karlin_k[0]);
114 } catch (const bad_alloc&) {
115 x_DoDestroy();
116 NCBI_THROW(CBlastSystemException, eOutOfMemory,
117 "Failed to allocate memory for BlastRPSAuxInfo structure");
118 }
119 }
120
~CBlastRPSAuxInfo()121 CBlastRPSAuxInfo::~CBlastRPSAuxInfo()
122 {
123 x_DoDestroy();
124 }
125
126 const BlastRPSAuxInfo*
operator ()() const127 CBlastRPSAuxInfo::operator()() const
128 {
129 return m_Data;
130 }
131
132 void
x_DoDestroy()133 CBlastRPSAuxInfo::x_DoDestroy()
134 {
135 if ( !m_Data ) {
136 return;
137 }
138 if (m_Data->orig_score_matrix) {
139 sfree(m_Data->orig_score_matrix);
140 }
141 if (m_Data->karlin_k) {
142 delete [] m_Data->karlin_k;
143 m_Data->karlin_k = NULL;
144 }
145 delete m_Data;
146 m_Data = NULL;
147 }
148
149 /////////////////////////////////////////////////////////////////////////////
150 //
151 // CRpsAuxFile
152 //
153 /////////////////////////////////////////////////////////////////////////////
154
155 /// This class represents the .aux file in a RPS-BLAST file, which contains
156 /// information about the scoring matrix to be used during the RPS-BLAST
157 /// search, the scaling factor, an array of K statistical values (karlin_k),
158 /// as well as various fields that are currently unused.
159 class CRpsAuxFile : public CObject {
160 public:
161 /// Extension associated with the RPS-BLAST database auxiliary file
162 static const string kExtension;
163
164 /// Parametrized constructor
165 /// @param filename_no_extn name of the file without extension
166 CRpsAuxFile(const string& filename_no_extn);
167
168 /// Lend the caller the pointer to the data structure this object manages.
169 /// Caller MUST NOT deallocate the return value.
170 const BlastRPSAuxInfo* operator()() const;
171
172 private:
173 /// Auxiliary method to read the contents of the file into m_Data
174 CRef<CBlastRPSAuxInfo> x_ReadFromFile(CNcbiIfstream& input);
175 CRef<CBlastRPSAuxInfo> m_Data;
176 };
177
178 const string CRpsAuxFile::kExtension(".aux");
179
CRpsAuxFile(const string & filename_no_extn)180 CRpsAuxFile::CRpsAuxFile(const string& filename_no_extn)
181 {
182 // Open the file
183 const string file2open(filename_no_extn + kExtension);
184 CNcbiIfstream auxfile(file2open.c_str());
185 if (auxfile.bad() || auxfile.fail()) {
186 NCBI_THROW(CBlastException, eRpsInit,
187 "Cannot open RPS-BLAST auxiliary file: " + file2open);
188 }
189 // Read the file
190 m_Data = x_ReadFromFile(auxfile);
191 auxfile.close();
192 }
193
194 CRef<CBlastRPSAuxInfo>
x_ReadFromFile(CNcbiIfstream & input)195 CRpsAuxFile::x_ReadFromFile(CNcbiIfstream& input)
196 {
197 // Declare data to read
198 string matrix;
199 int gap_open;
200 int gap_extend;
201 double scale_factor;
202 vector<double> karlinK;
203 double ignore_me_d;
204 int ignore_me_i;
205
206 // Read the data
207 input >> matrix;
208 input >> gap_open;
209 input >> gap_extend;
210 input >> ignore_me_d; // corresponds to BlastRPSAuxInfo::ungapped_k
211 input >> ignore_me_d; // corresponds to BlastRPSAuxInfo::ungapped_h
212 input >> ignore_me_i; // corresponds to BlastRPSAuxInfo::max_db_seq_length
213 input >> ignore_me_i; // corresponds to BlastRPSAuxInfo::db_length
214 input >> scale_factor;
215 while (input) {
216 int seq_size; // unused value
217 double k;
218 input >> seq_size;
219 input >> k;
220 karlinK.push_back(k);
221 }
222
223 CRef<CBlastRPSAuxInfo> retval(new CBlastRPSAuxInfo(matrix,
224 gap_open,
225 gap_extend,
226 scale_factor,
227 karlinK));
228 return retval;
229 }
230
231 const BlastRPSAuxInfo*
operator ()() const232 CRpsAuxFile::operator()() const
233 {
234 return (*m_Data)();
235 }
236
237 /////////////////////////////////////////////////////////////////////////////
238 //
239 // CRpsMmappedFile
240 //
241 /////////////////////////////////////////////////////////////////////////////
242
243 /// Encapsulates logic of mmap'ing and performing sanity checks on RPS-BLAST
244 /// database files
245 class CRpsMmappedFile : public CObject {
246 public:
247 /// Parametrized constructor
248 /// @param filename name of the file to mmap
249 CRpsMmappedFile(const string& filename);
250 protected:
251 /// The data structure this class manages
252 unique_ptr<CMemoryFile> m_MmappedFile;
253 };
254
CRpsMmappedFile(const string & filename)255 CRpsMmappedFile::CRpsMmappedFile(const string& filename)
256 {
257 try { m_MmappedFile.reset(new CMemoryFile(filename)); }
258 catch (const CFileException& e) {
259 NCBI_RETHROW(e, CBlastException, eRpsInit,
260 "Cannot memory map RPS-BLAST database file: " + filename);
261 }
262 }
263
264 /////////////////////////////////////////////////////////////////////////////
265 //
266 // CRpsLookupTblFile
267 //
268 /////////////////////////////////////////////////////////////////////////////
269
270 /// This class represents the .loo file in a RPS-BLAST file, which contains the
271 /// pre-computed lookup table
272 class CRpsLookupTblFile : public CRpsMmappedFile {
273 public:
274 /// Extension associated with the RPS-BLAST database lookup table file
275 static const string kExtension;
276
277 /// Parametrized constructor
278 /// @param filename_no_extn name of the file without extension
279 CRpsLookupTblFile(const string& filename_no_extn);
280
281 /// Lend the caller the pointer to the data structure this object manages.
282 /// Caller MUST NOT deallocate the return value.
283 const BlastRPSLookupFileHeader* operator()() const;
284 private:
285 /// The data structure this class manages
286 BlastRPSLookupFileHeader* m_Data;
287 };
288
289 const string CRpsLookupTblFile::kExtension(".loo");
290
CRpsLookupTblFile(const string & filename_no_extn)291 CRpsLookupTblFile::CRpsLookupTblFile(const string& filename_no_extn)
292 : CRpsMmappedFile(filename_no_extn + kExtension)
293 {
294 m_Data = (BlastRPSLookupFileHeader*) m_MmappedFile->GetPtr();
295 if (m_Data->magic_number != RPS_MAGIC_NUM &&
296 m_Data->magic_number != RPS_MAGIC_NUM_28) {
297 m_Data = NULL;
298 NCBI_THROW(CBlastException, eRpsInit,
299 "RPS BLAST profile file (" + filename_no_extn + kExtension +
300 ") is either corrupt or constructed for an incompatible "
301 "architecture");
302 }
303 }
304
305 const BlastRPSLookupFileHeader*
operator ()() const306 CRpsLookupTblFile::operator()() const
307 {
308 return m_Data;
309 }
310
311 /////////////////////////////////////////////////////////////////////////////
312 //
313 // CRpsPssmFile
314 //
315 /////////////////////////////////////////////////////////////////////////////
316
317 /// This class represents the .rps file in a RPS-BLAST file, which contains the
318 /// PSSMs for the database
319 class CRpsPssmFile : public CRpsMmappedFile {
320 public:
321 /// Extension associated with the RPS-BLAST database PSSM file
322 static const string kExtension;
323
324 /// Parametrized constructor
325 /// @param filename_no_extn name of the file without extension
326 CRpsPssmFile(const string& filename_no_extn);
327
328 /// Lend the caller the pointer to the data structure this object manages.
329 /// Caller MUST NOT deallocate the return value.
330 const BlastRPSProfileHeader* operator()() const;
331 private:
332 /// The data structure this class manages
333 BlastRPSProfileHeader* m_Data;
334 };
335
336 const string CRpsPssmFile::kExtension(".rps");
337
CRpsPssmFile(const string & filename_no_extn)338 CRpsPssmFile::CRpsPssmFile(const string& filename_no_extn)
339 : CRpsMmappedFile(filename_no_extn + kExtension)
340 {
341 m_Data = (BlastRPSProfileHeader*) m_MmappedFile->GetPtr();
342 if (m_Data->magic_number != RPS_MAGIC_NUM &&
343 m_Data->magic_number != RPS_MAGIC_NUM_28) {
344 m_Data = NULL;
345 NCBI_THROW(CBlastException, eRpsInit,
346 "RPS BLAST profile file (" + filename_no_extn + kExtension +
347 ") is either corrupt or constructed for an incompatible "
348 "architecture");
349 }
350 }
351
352 const BlastRPSProfileHeader*
operator ()() const353 CRpsPssmFile::operator()() const
354 {
355 return m_Data;
356 }
357
358
359 /////////////////////////////////////////////////////////////////////////////
360 //
361 // CRpsFreqsFile
362 //
363 /////////////////////////////////////////////////////////////////////////////
364
365 /// This class represents the .wcounts file in a RPS-BLAST file, which contains
366 /// the weighted residue frequencies for the database
367 class CRpsFreqsFile : public CRpsMmappedFile {
368 public:
369 /// Extension associated with the RPS-BLAST database PSSM file
370 static const string kExtension;
371
372 /// Parametrized constructor
373 /// @param filename_no_extn name of the file without extension
374 CRpsFreqsFile(const string& filename_no_extn);
375
376 /// Lend the caller the pointer to the data structure this object manages.
377 /// Caller MUST NOT deallocate the return value.
378 const BlastRPSProfileHeader* operator()() const;
379 private:
380 /// The data this class manages
381 BlastRPSProfileHeader* m_Data;
382 };
383
384 const string CRpsFreqsFile::kExtension(".wcounts");
385
CRpsFreqsFile(const string & filename_no_extn)386 CRpsFreqsFile::CRpsFreqsFile(const string& filename_no_extn)
387 : CRpsMmappedFile(filename_no_extn + kExtension)
388 {
389
390 m_Data = (BlastRPSProfileHeader*)m_MmappedFile->GetPtr();
391
392 if (m_Data->magic_number != RPS_MAGIC_NUM &&
393 m_Data->magic_number != RPS_MAGIC_NUM_28) {
394 m_Data = NULL;
395 NCBI_THROW(CBlastException, eRpsInit,
396 "RPS BLAST profile file (" + filename_no_extn + kExtension +
397 ") is either corrupt or constructed for an incompatible "
398 "architecture");
399 }
400 }
401
402 const BlastRPSProfileHeader*
operator ()() const403 CRpsFreqsFile::operator()() const
404 {
405 return m_Data;
406 }
407
408
409 /////////////////////////////////////////////////////////////////////////////
410 //
411 // CRpsObsrFile
412 //
413 /////////////////////////////////////////////////////////////////////////////
414
415 /// This class represents the .obsr file in a RPS-BLAST file, which contains
416 /// the numbers of independent observations for the database
417 class CRpsObsrFile : public CRpsMmappedFile {
418 public:
419 /// Extension associated with the RPS-BLAST database PSSM file
420 static const string kExtension;
421
422 /// Parametrized constructor
423 /// @param filename_no_extn name of the file without extension
424 CRpsObsrFile(const string& filename_no_extn);
425
426 /// Lend the caller the pointer to the data structure this object manages.
427 /// Caller MUST NOT deallocate the return value.
428 const BlastRPSProfileHeader* operator()() const;
429 private:
430 /// Header
431 BlastRPSProfileHeader* m_Data;
432 };
433
434 const string CRpsObsrFile::kExtension(".obsr");
435
CRpsObsrFile(const string & filename_no_extn)436 CRpsObsrFile::CRpsObsrFile(const string& filename_no_extn)
437 : CRpsMmappedFile(filename_no_extn + kExtension)
438 {
439 m_Data = (BlastRPSProfileHeader*)m_MmappedFile->GetPtr();
440
441 if (m_Data->magic_number != RPS_MAGIC_NUM &&
442 m_Data->magic_number != RPS_MAGIC_NUM_28) {
443 m_Data = NULL;
444 NCBI_THROW(CBlastException, eRpsInit,
445 "RPS BLAST profile file (" + filename_no_extn + kExtension +
446 ") is either corrupt or constructed for an incompatible "
447 "architecture");
448 }
449 }
450
451 const BlastRPSProfileHeader*
operator ()() const452 CRpsObsrFile::operator()() const
453 {
454 return m_Data;
455 }
456
457 /////////////////////////////////////////////////////////////////////////////
458 //
459 // CRpsFreqRatiosFile
460 //
461 /////////////////////////////////////////////////////////////////////////////
462
463 /// This class represents the .freq file in a RPS-BLAST file, which contains
464 /// the frequency ratios for the database
465 class CRpsFreqRatiosFile : public CRpsMmappedFile {
466 public:
467 /// Extension associated with the RPS-BLAST database PSSM file
468 static const string kExtension;
469
470 /// Parametrized constructor
471 /// @param filename_no_extn name of the file without extension
472 CRpsFreqRatiosFile(const string& filename_no_extn);
473
~CRpsFreqRatiosFile()474 virtual ~CRpsFreqRatiosFile(){};
475
476 /// Lend the caller the pointer to the data structure this object manages.
477 /// Caller MUST NOT deallocate the return value.
478 const BlastRPSFreqRatiosHeader* operator()() const;
479 private:
480 /// The data this class manages
481 BlastRPSFreqRatiosHeader* m_Data;
482 };
483
484 const string CRpsFreqRatiosFile::kExtension(".freq");
485
CRpsFreqRatiosFile(const string & filename_no_extn)486 CRpsFreqRatiosFile::CRpsFreqRatiosFile(const string& filename_no_extn)
487 : CRpsMmappedFile(filename_no_extn + kExtension), m_Data(NULL)
488 {
489 m_Data = (BlastRPSFreqRatiosHeader *) m_MmappedFile->GetPtr();
490 if (m_Data->magic_number != RPS_MAGIC_NUM &&
491 m_Data->magic_number != RPS_MAGIC_NUM_28) {
492 m_Data = NULL;
493 NCBI_THROW(CBlastException, eRpsInit,
494 "RPS BLAST freq ratios file (" + filename_no_extn + kExtension +
495 ") is either corrupt or constructed for an incompatible "
496 "architecture");
497 }
498
499 }
500
501 const BlastRPSFreqRatiosHeader*
operator ()() const502 CRpsFreqRatiosFile::operator()() const
503 {
504 return m_Data;
505 }
506
507
CBlastRPSInfo(const string & rps_dbname)508 CBlastRPSInfo::CBlastRPSInfo(const string& rps_dbname)
509 {
510 x_Init(rps_dbname, fRpsBlast);
511 }
512
CBlastRPSInfo(const string & rps_dbname,int flags)513 CBlastRPSInfo::CBlastRPSInfo(const string& rps_dbname, int flags)
514 {
515 x_Init(rps_dbname, flags);
516 }
517
x_Init(const string & rps_dbname,int flags)518 void CBlastRPSInfo::x_Init(const string& rps_dbname, int flags)
519 {
520 // Obtain the full path to the database
521 string path;
522 try {
523 vector<string> dbpath;
524 CSeqDB::FindVolumePaths(rps_dbname, CSeqDB::eProtein, dbpath);
525 path = *dbpath.begin();
526 } catch (const CSeqDBException& e) {
527 NCBI_RETHROW(e, CBlastException, eRpsInit,
528 "Cannot retrieve path to RPS database");
529 }
530 _ASSERT(!path.empty());
531
532 unique_ptr<BlastRPSInfo> rps_info;
533
534 // Allocate the core data structure
535 try { m_RpsInfo.reset(new BlastRPSInfo); }
536 catch (const bad_alloc&) {
537 NCBI_THROW(CBlastSystemException, eOutOfMemory,
538 "RPSInfo allocation failed");
539 }
540
541 // Assign the pointers to the core data structure
542 m_RpsInfo->lookup_header = NULL;
543 m_RpsInfo->profile_header = NULL;
544 m_RpsInfo->freq_header = NULL;
545 m_RpsInfo->obsr_header = NULL;
546 m_RpsInfo->freq_ratios_header = NULL;
547
548 // Load the various files
549 if (flags & fAuxInfoFile) {
550 m_AuxFile.Reset(new CRpsAuxFile(path));
551
552 // Note that these const_casts are only needed because the data structure
553 // doesn't take const pointers, but these won't be modified at all
554 m_RpsInfo->aux_info =
555 *const_cast<BlastRPSAuxInfo*>((*m_AuxFile)());
556 }
557
558 if (flags & fLookupTableFile) {
559 m_LutFile.Reset(new CRpsLookupTblFile(path));
560
561 // Note that these const_casts are only needed because the data structure
562 // doesn't take const pointers, but these won't be modified at all
563 m_RpsInfo->lookup_header =
564 const_cast<BlastRPSLookupFileHeader*>((*m_LutFile)());
565 }
566
567 if (flags & fPssmFile) {
568 m_PssmFile.Reset(new CRpsPssmFile(path));
569
570 // Note that these const_casts are only needed because the data structure
571 // doesn't take const pointers, but these won't be modified at all
572 m_RpsInfo->profile_header =
573 const_cast<BlastRPSProfileHeader*>((*m_PssmFile)());
574 }
575
576 if (flags & fFrequenciesFile) {
577 m_FreqsFile.Reset(new CRpsFreqsFile(path));
578
579 // Note that these const_casts are only needed because the data structure
580 // doesn't take const pointers, but these won't be modified at all
581 m_RpsInfo->freq_header =
582 const_cast<BlastRPSProfileHeader*>((*m_FreqsFile)());
583 }
584
585 if (flags & fObservationsFile) {
586 m_ObsrFile.Reset(new CRpsObsrFile(path));
587
588 // Note that these const_casts are only needed because the data structure
589 // doesn't take const pointers, but these won't be modified at all
590 m_RpsInfo->obsr_header =
591 const_cast<BlastRPSProfileHeader*>((*m_ObsrFile)());
592 }
593
594 if (flags & fFreqRatiosFile) {
595 try {
596 // read frequency ratios data
597 m_FreqRatiosFile.Reset(new CRpsFreqRatiosFile(path));
598 } catch (const CBlastException& e) {
599 string msg = rps_dbname + " contains no frequency ratios needed for composition-based statistics.\n" \
600 "Please disable composition-based statistics when searching against " + rps_dbname + ".";
601 NCBI_RETHROW(e, CBlastException, eRpsInit, msg);
602 }
603 m_RpsInfo->freq_ratios_header =
604 const_cast<BlastRPSFreqRatiosHeader*>((*m_FreqRatiosFile)());
605 }
606 }
607
608 // Trivial at this point, but left out-of-line so that the header doesn't
609 // need to pull in full declarations of the classes to which it takes CRefs.
~CBlastRPSInfo()610 CBlastRPSInfo::~CBlastRPSInfo()
611 {
612 }
613
614 const BlastRPSInfo*
operator ()() const615 CBlastRPSInfo::operator()() const
616 {
617 return m_RpsInfo.get();
618 }
619
620 double
GetScalingFactor() const621 CBlastRPSInfo::GetScalingFactor() const
622 {
623 return m_RpsInfo->aux_info.scale_factor;
624 }
625
626 const char*
GetMatrixName() const627 CBlastRPSInfo::GetMatrixName() const
628 {
629 return m_RpsInfo->aux_info.orig_score_matrix;
630 }
631
632 int
GetGapOpeningCost() const633 CBlastRPSInfo::GetGapOpeningCost() const
634 {
635 return m_RpsInfo->aux_info.gap_open_penalty;
636 }
637
638 int
GetGapExtensionCost() const639 CBlastRPSInfo::GetGapExtensionCost() const
640 {
641 return m_RpsInfo->aux_info.gap_extend_penalty;
642 }
643
644 END_SCOPE(blast)
645 END_NCBI_SCOPE
646
647 /* @} */
648