1 /* ===========================================================================
2  *
3  *                            PUBLIC DOMAIN NOTICE
4  *               National Center for Biotechnology Information
5  *
6  *  This software/database is a "United States Government Work" under the
7  *  terms of the United States Copyright Act.  It was written as part of
8  *  the author's official duties as a United States Government employee and
9  *  thus cannot be copyrighted.  This software/database is freely available
10  *  to the public for use. The National Library of Medicine and the U.S.
11  *  Government have not placed any restriction on its use or reproduction.
12  *
13  *  Although all reasonable efforts have been taken to ensure the accuracy
14  *  and reliability of the software and data, the NLM and the U.S.
15  *  Government do not and cannot warrant the performance or results that
16  *  may be obtained by using this software or data. The NLM and the U.S.
17  *  Government disclaim all warranties, express or implied, including
18  *  warranties of performance, merchantability or fitness for any particular
19  *  purpose.
20  *
21  *  Please cite the author in any work or product based on this material.
22  *
23  * ===========================================================================
24  *
25  * Author:  Christiam Camacho
26  *
27  */
28 
29 /** @file rps_aux.cpp
30  * Implements auxiliary classes to manage RPS-BLAST related C-structures
31  */
32 
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <algo/blast/api/blast_exception.hpp>
37 #include <objtools/blast/seqdb_reader/seqdb.hpp>
38 #include <algo/blast/api/rps_aux.hpp>
39 #include <algo/blast/core/blast_def.h>
40 #include <memory>
41 
42 /** @addtogroup AlgoBlast
43  *
44  * @{
45  */
46 
47 BEGIN_NCBI_SCOPE
48 BEGIN_SCOPE(blast)
49 
50 /////////////////////////////////////////////////////////////////////////////
51 //
52 // CBlastRPSAuxInfo
53 //
54 /////////////////////////////////////////////////////////////////////////////
55 
56 /// Wrapper class to manage the BlastRPSAuxInfo structure, as currently
57 /// there aren't any allocation or deallocation functions for this structure in
58 /// the CORE of BLAST. This class is meant to be kept in a CRef<>. Note that
59 /// only the fields currently used are being passed to the constructor
60 class CBlastRPSAuxInfo : public CObject {
61 public:
62     /// Parametrized constructor
63     /// @param matrix name of the scoring matrix used to build the RPS-BLAST
64     /// database
65     /// @param gap_open gap opening cost
66     /// @param gap_extend gap extension cost
67     /// @param scale_factor scaling factor
68     /// @param karlin_k statistical K parameter calculated when building the
69     /// RPS-BLAST database
70     CBlastRPSAuxInfo(const string& matrix,
71                      int gap_open,
72                      int gap_extend,
73                      double scale_factor,
74                      const vector<double>& karlin_k);
75 
76     /// Destructor
77     ~CBlastRPSAuxInfo();
78 
79     /// Lend the caller the pointer to the data structure this object manages.
80     /// Caller MUST NOT deallocate the return value.
81     const BlastRPSAuxInfo* operator()() const;
82 private:
83     /// Prohibit copy-constructor
84     CBlastRPSAuxInfo(const CBlastRPSAuxInfo& rhs);
85     /// Prohibit assignment operator
86     CBlastRPSAuxInfo& operator=(const CBlastRPSAuxInfo& rhs);
87 
88     /// Deallocates the structure owned by this class
89     void x_DoDestroy();
90 
91     /// The data structure this class manages
92     BlastRPSAuxInfo* m_Data;
93 };
94 
CBlastRPSAuxInfo(const string & matrix,int gap_open,int gap_extend,double scale_factor,const vector<double> & karlin_k)95 CBlastRPSAuxInfo::CBlastRPSAuxInfo(const string& matrix,
96                                    int gap_open,
97                                    int gap_extend,
98                                    double scale_factor,
99                                    const vector<double>& karlin_k)
100     : m_Data(0)
101 {
102     _ASSERT(!matrix.empty());
103     _ASSERT(!karlin_k.empty());
104 
105     try {
106         m_Data = new BlastRPSAuxInfo;
107         memset(m_Data, 0, sizeof(BlastRPSAuxInfo));
108         m_Data->orig_score_matrix = strdup(matrix.c_str());
109         m_Data->gap_open_penalty = gap_open;
110         m_Data->gap_extend_penalty = gap_extend;
111         m_Data->scale_factor = scale_factor;
112         m_Data->karlin_k = new double[karlin_k.size()];
113         copy(karlin_k.begin(), karlin_k.end(), &m_Data->karlin_k[0]);
114     } catch (const bad_alloc&) {
115         x_DoDestroy();
116         NCBI_THROW(CBlastSystemException, eOutOfMemory,
117                "Failed to allocate memory for BlastRPSAuxInfo structure");
118     }
119 }
120 
~CBlastRPSAuxInfo()121 CBlastRPSAuxInfo::~CBlastRPSAuxInfo()
122 {
123     x_DoDestroy();
124 }
125 
126 const BlastRPSAuxInfo*
operator ()() const127 CBlastRPSAuxInfo::operator()() const
128 {
129     return m_Data;
130 }
131 
132 void
x_DoDestroy()133 CBlastRPSAuxInfo::x_DoDestroy()
134 {
135     if ( !m_Data ) {
136         return;
137     }
138     if (m_Data->orig_score_matrix) {
139         sfree(m_Data->orig_score_matrix);
140     }
141     if (m_Data->karlin_k) {
142         delete [] m_Data->karlin_k;
143         m_Data->karlin_k = NULL;
144     }
145     delete m_Data;
146     m_Data = NULL;
147 }
148 
149 /////////////////////////////////////////////////////////////////////////////
150 //
151 // CRpsAuxFile
152 //
153 /////////////////////////////////////////////////////////////////////////////
154 
155 /// This class represents the .aux file in a RPS-BLAST file, which contains
156 /// information about the scoring matrix to be used during the RPS-BLAST
157 /// search, the scaling factor, an array of K statistical values (karlin_k),
158 /// as well as various fields that are currently unused.
159 class CRpsAuxFile : public CObject {
160 public:
161     /// Extension associated with the RPS-BLAST database auxiliary file
162     static const string kExtension;
163 
164     /// Parametrized constructor
165     /// @param filename_no_extn name of the file without extension
166     CRpsAuxFile(const string& filename_no_extn);
167 
168     /// Lend the caller the pointer to the data structure this object manages.
169     /// Caller MUST NOT deallocate the return value.
170     const BlastRPSAuxInfo* operator()() const;
171 
172 private:
173     /// Auxiliary method to read the contents of the file into m_Data
174     CRef<CBlastRPSAuxInfo> x_ReadFromFile(CNcbiIfstream& input);
175     CRef<CBlastRPSAuxInfo> m_Data;
176 };
177 
178 const string CRpsAuxFile::kExtension(".aux");
179 
CRpsAuxFile(const string & filename_no_extn)180 CRpsAuxFile::CRpsAuxFile(const string& filename_no_extn)
181 {
182     // Open the file
183     const string file2open(filename_no_extn + kExtension);
184     CNcbiIfstream auxfile(file2open.c_str());
185     if (auxfile.bad() || auxfile.fail()) {
186         NCBI_THROW(CBlastException, eRpsInit,
187                    "Cannot open RPS-BLAST auxiliary file: " + file2open);
188     }
189     // Read the file
190     m_Data = x_ReadFromFile(auxfile);
191     auxfile.close();
192 }
193 
194 CRef<CBlastRPSAuxInfo>
x_ReadFromFile(CNcbiIfstream & input)195 CRpsAuxFile::x_ReadFromFile(CNcbiIfstream& input)
196 {
197     // Declare data to read
198     string matrix;
199     int gap_open;
200     int gap_extend;
201     double scale_factor;
202     vector<double> karlinK;
203     double ignore_me_d;
204     int ignore_me_i;
205 
206     // Read the data
207     input >> matrix;
208     input >> gap_open;
209     input >> gap_extend;
210     input >> ignore_me_d; // corresponds to BlastRPSAuxInfo::ungapped_k
211     input >> ignore_me_d; // corresponds to BlastRPSAuxInfo::ungapped_h
212     input >> ignore_me_i; // corresponds to BlastRPSAuxInfo::max_db_seq_length
213     input >> ignore_me_i; // corresponds to BlastRPSAuxInfo::db_length
214     input >> scale_factor;
215     while (input) {
216         int seq_size;   // unused value
217         double k;
218         input >> seq_size;
219         input >> k;
220         karlinK.push_back(k);
221     }
222 
223     CRef<CBlastRPSAuxInfo> retval(new CBlastRPSAuxInfo(matrix,
224                                                        gap_open,
225                                                        gap_extend,
226                                                        scale_factor,
227                                                        karlinK));
228     return retval;
229 }
230 
231 const BlastRPSAuxInfo*
operator ()() const232 CRpsAuxFile::operator()() const
233 {
234     return (*m_Data)();
235 }
236 
237 /////////////////////////////////////////////////////////////////////////////
238 //
239 // CRpsMmappedFile
240 //
241 /////////////////////////////////////////////////////////////////////////////
242 
243 /// Encapsulates logic of mmap'ing and performing sanity checks on RPS-BLAST
244 /// database files
245 class CRpsMmappedFile : public CObject {
246 public:
247     /// Parametrized constructor
248     /// @param filename name of the file to mmap
249     CRpsMmappedFile(const string& filename);
250 protected:
251     /// The data structure this class manages
252     unique_ptr<CMemoryFile> m_MmappedFile;
253 };
254 
CRpsMmappedFile(const string & filename)255 CRpsMmappedFile::CRpsMmappedFile(const string& filename)
256 {
257     try { m_MmappedFile.reset(new CMemoryFile(filename)); }
258     catch (const CFileException& e) {
259         NCBI_RETHROW(e, CBlastException, eRpsInit,
260              "Cannot memory map RPS-BLAST database file: " + filename);
261     }
262 }
263 
264 /////////////////////////////////////////////////////////////////////////////
265 //
266 // CRpsLookupTblFile
267 //
268 /////////////////////////////////////////////////////////////////////////////
269 
270 /// This class represents the .loo file in a RPS-BLAST file, which contains the
271 /// pre-computed lookup table
272 class CRpsLookupTblFile : public CRpsMmappedFile {
273 public:
274     /// Extension associated with the RPS-BLAST database lookup table file
275     static const string kExtension;
276 
277     /// Parametrized constructor
278     /// @param filename_no_extn name of the file without extension
279     CRpsLookupTblFile(const string& filename_no_extn);
280 
281     /// Lend the caller the pointer to the data structure this object manages.
282     /// Caller MUST NOT deallocate the return value.
283     const BlastRPSLookupFileHeader* operator()() const;
284 private:
285     /// The data structure this class manages
286     BlastRPSLookupFileHeader* m_Data;
287 };
288 
289 const string CRpsLookupTblFile::kExtension(".loo");
290 
CRpsLookupTblFile(const string & filename_no_extn)291 CRpsLookupTblFile::CRpsLookupTblFile(const string& filename_no_extn)
292     : CRpsMmappedFile(filename_no_extn + kExtension)
293 {
294     m_Data = (BlastRPSLookupFileHeader*) m_MmappedFile->GetPtr();
295     if (m_Data->magic_number != RPS_MAGIC_NUM &&
296         m_Data->magic_number != RPS_MAGIC_NUM_28) {
297         m_Data = NULL;
298         NCBI_THROW(CBlastException, eRpsInit,
299                "RPS BLAST profile file (" + filename_no_extn + kExtension +
300                ") is either corrupt or constructed for an incompatible "
301                "architecture");
302     }
303 }
304 
305 const BlastRPSLookupFileHeader*
operator ()() const306 CRpsLookupTblFile::operator()() const
307 {
308     return m_Data;
309 }
310 
311 /////////////////////////////////////////////////////////////////////////////
312 //
313 // CRpsPssmFile
314 //
315 /////////////////////////////////////////////////////////////////////////////
316 
317 /// This class represents the .rps file in a RPS-BLAST file, which contains the
318 /// PSSMs for the database
319 class CRpsPssmFile : public CRpsMmappedFile {
320 public:
321     /// Extension associated with the RPS-BLAST database PSSM file
322     static const string kExtension;
323 
324     /// Parametrized constructor
325     /// @param filename_no_extn name of the file without extension
326     CRpsPssmFile(const string& filename_no_extn);
327 
328     /// Lend the caller the pointer to the data structure this object manages.
329     /// Caller MUST NOT deallocate the return value.
330     const BlastRPSProfileHeader* operator()() const;
331 private:
332     /// The data structure this class manages
333     BlastRPSProfileHeader* m_Data;
334 };
335 
336 const string CRpsPssmFile::kExtension(".rps");
337 
CRpsPssmFile(const string & filename_no_extn)338 CRpsPssmFile::CRpsPssmFile(const string& filename_no_extn)
339     : CRpsMmappedFile(filename_no_extn + kExtension)
340 {
341     m_Data = (BlastRPSProfileHeader*) m_MmappedFile->GetPtr();
342     if (m_Data->magic_number != RPS_MAGIC_NUM &&
343         m_Data->magic_number != RPS_MAGIC_NUM_28) {
344         m_Data = NULL;
345         NCBI_THROW(CBlastException, eRpsInit,
346                "RPS BLAST profile file (" + filename_no_extn + kExtension +
347                ") is either corrupt or constructed for an incompatible "
348                "architecture");
349     }
350 }
351 
352 const BlastRPSProfileHeader*
operator ()() const353 CRpsPssmFile::operator()() const
354 {
355     return m_Data;
356 }
357 
358 
359 /////////////////////////////////////////////////////////////////////////////
360 //
361 // CRpsFreqsFile
362 //
363 /////////////////////////////////////////////////////////////////////////////
364 
365 /// This class represents the .wcounts file in a RPS-BLAST file, which contains
366 /// the weighted residue frequencies for the database
367 class CRpsFreqsFile : public CRpsMmappedFile {
368 public:
369     /// Extension associated with the RPS-BLAST database PSSM file
370     static const string kExtension;
371 
372     /// Parametrized constructor
373     /// @param filename_no_extn name of the file without extension
374     CRpsFreqsFile(const string& filename_no_extn);
375 
376     /// Lend the caller the pointer to the data structure this object manages.
377     /// Caller MUST NOT deallocate the return value.
378     const BlastRPSProfileHeader* operator()() const;
379 private:
380     /// The data this class manages
381     BlastRPSProfileHeader* m_Data;
382 };
383 
384 const string CRpsFreqsFile::kExtension(".wcounts");
385 
CRpsFreqsFile(const string & filename_no_extn)386 CRpsFreqsFile::CRpsFreqsFile(const string& filename_no_extn)
387     : CRpsMmappedFile(filename_no_extn + kExtension)
388 {
389 
390     m_Data = (BlastRPSProfileHeader*)m_MmappedFile->GetPtr();
391 
392     if (m_Data->magic_number != RPS_MAGIC_NUM &&
393         m_Data->magic_number != RPS_MAGIC_NUM_28) {
394         m_Data = NULL;
395         NCBI_THROW(CBlastException, eRpsInit,
396                "RPS BLAST profile file (" + filename_no_extn + kExtension +
397                ") is either corrupt or constructed for an incompatible "
398                "architecture");
399     }
400 }
401 
402 const BlastRPSProfileHeader*
operator ()() const403 CRpsFreqsFile::operator()() const
404 {
405     return m_Data;
406 }
407 
408 
409 /////////////////////////////////////////////////////////////////////////////
410 //
411 // CRpsObsrFile
412 //
413 /////////////////////////////////////////////////////////////////////////////
414 
415 /// This class represents the .obsr file in a RPS-BLAST file, which contains
416 /// the numbers of independent observations for the database
417 class CRpsObsrFile : public CRpsMmappedFile {
418 public:
419     /// Extension associated with the RPS-BLAST database PSSM file
420     static const string kExtension;
421 
422     /// Parametrized constructor
423     /// @param filename_no_extn name of the file without extension
424     CRpsObsrFile(const string& filename_no_extn);
425 
426     /// Lend the caller the pointer to the data structure this object manages.
427     /// Caller MUST NOT deallocate the return value.
428     const BlastRPSProfileHeader* operator()() const;
429 private:
430     /// Header
431     BlastRPSProfileHeader* m_Data;
432 };
433 
434 const string CRpsObsrFile::kExtension(".obsr");
435 
CRpsObsrFile(const string & filename_no_extn)436 CRpsObsrFile::CRpsObsrFile(const string& filename_no_extn)
437     : CRpsMmappedFile(filename_no_extn + kExtension)
438 {
439     m_Data = (BlastRPSProfileHeader*)m_MmappedFile->GetPtr();
440 
441     if (m_Data->magic_number != RPS_MAGIC_NUM &&
442         m_Data->magic_number != RPS_MAGIC_NUM_28) {
443         m_Data = NULL;
444         NCBI_THROW(CBlastException, eRpsInit,
445                "RPS BLAST profile file (" + filename_no_extn + kExtension +
446                ") is either corrupt or constructed for an incompatible "
447                "architecture");
448     }
449 }
450 
451 const BlastRPSProfileHeader*
operator ()() const452 CRpsObsrFile::operator()() const
453 {
454     return m_Data;
455 }
456 
457 /////////////////////////////////////////////////////////////////////////////
458 //
459 // CRpsFreqRatiosFile
460 //
461 /////////////////////////////////////////////////////////////////////////////
462 
463 /// This class represents the .freq file in a RPS-BLAST file, which contains
464 /// the frequency ratios for the database
465 class CRpsFreqRatiosFile : public CRpsMmappedFile {
466 public:
467     /// Extension associated with the RPS-BLAST database PSSM file
468     static const string kExtension;
469 
470     /// Parametrized constructor
471     /// @param filename_no_extn name of the file without extension
472     CRpsFreqRatiosFile(const string& filename_no_extn);
473 
~CRpsFreqRatiosFile()474     virtual ~CRpsFreqRatiosFile(){};
475 
476     /// Lend the caller the pointer to the data structure this object manages.
477     /// Caller MUST NOT deallocate the return value.
478     const BlastRPSFreqRatiosHeader* operator()() const;
479 private:
480     /// The data this class manages
481     BlastRPSFreqRatiosHeader* m_Data;
482 };
483 
484 const string CRpsFreqRatiosFile::kExtension(".freq");
485 
CRpsFreqRatiosFile(const string & filename_no_extn)486 CRpsFreqRatiosFile::CRpsFreqRatiosFile(const string& filename_no_extn)
487     : CRpsMmappedFile(filename_no_extn + kExtension), m_Data(NULL)
488 {
489     m_Data = (BlastRPSFreqRatiosHeader *) m_MmappedFile->GetPtr();
490     if (m_Data->magic_number != RPS_MAGIC_NUM &&
491             m_Data->magic_number != RPS_MAGIC_NUM_28) {
492             m_Data = NULL;
493             NCBI_THROW(CBlastException, eRpsInit,
494                    "RPS BLAST freq ratios file (" + filename_no_extn + kExtension +
495                    ") is either corrupt or constructed for an incompatible "
496                    "architecture");
497     }
498 
499 }
500 
501 const BlastRPSFreqRatiosHeader*
operator ()() const502 CRpsFreqRatiosFile::operator()() const
503 {
504     return m_Data;
505 }
506 
507 
CBlastRPSInfo(const string & rps_dbname)508 CBlastRPSInfo::CBlastRPSInfo(const string& rps_dbname)
509 {
510     x_Init(rps_dbname, fRpsBlast);
511 }
512 
CBlastRPSInfo(const string & rps_dbname,int flags)513 CBlastRPSInfo::CBlastRPSInfo(const string& rps_dbname, int flags)
514 {
515     x_Init(rps_dbname, flags);
516 }
517 
x_Init(const string & rps_dbname,int flags)518 void CBlastRPSInfo::x_Init(const string& rps_dbname, int flags)
519 {
520     // Obtain the full path to the database
521     string path;
522     try {
523         vector<string> dbpath;
524         CSeqDB::FindVolumePaths(rps_dbname, CSeqDB::eProtein, dbpath);
525         path = *dbpath.begin();
526     } catch (const CSeqDBException& e) {
527         NCBI_RETHROW(e, CBlastException, eRpsInit,
528                      "Cannot retrieve path to RPS database");
529     }
530     _ASSERT(!path.empty());
531 
532     unique_ptr<BlastRPSInfo> rps_info;
533 
534     // Allocate the core data structure
535     try { m_RpsInfo.reset(new BlastRPSInfo); }
536     catch (const bad_alloc&) {
537         NCBI_THROW(CBlastSystemException, eOutOfMemory,
538                    "RPSInfo allocation failed");
539     }
540 
541     // Assign the pointers to the core data structure
542     m_RpsInfo->lookup_header = NULL;
543     m_RpsInfo->profile_header = NULL;
544     m_RpsInfo->freq_header = NULL;
545     m_RpsInfo->obsr_header = NULL;
546     m_RpsInfo->freq_ratios_header = NULL;
547 
548         // Load the various files
549     if (flags & fAuxInfoFile) {
550         m_AuxFile.Reset(new CRpsAuxFile(path));
551 
552         // Note that these const_casts are only needed because the data structure
553         // doesn't take const pointers, but these won't be modified at all
554         m_RpsInfo->aux_info =
555             *const_cast<BlastRPSAuxInfo*>((*m_AuxFile)());
556     }
557 
558     if (flags & fLookupTableFile) {
559         m_LutFile.Reset(new CRpsLookupTblFile(path));
560 
561         // Note that these const_casts are only needed because the data structure
562         // doesn't take const pointers, but these won't be modified at all
563         m_RpsInfo->lookup_header =
564             const_cast<BlastRPSLookupFileHeader*>((*m_LutFile)());
565     }
566 
567     if (flags & fPssmFile) {
568         m_PssmFile.Reset(new CRpsPssmFile(path));
569 
570         // Note that these const_casts are only needed because the data structure
571         // doesn't take const pointers, but these won't be modified at all
572         m_RpsInfo->profile_header =
573             const_cast<BlastRPSProfileHeader*>((*m_PssmFile)());
574     }
575 
576     if (flags & fFrequenciesFile) {
577         m_FreqsFile.Reset(new CRpsFreqsFile(path));
578 
579         // Note that these const_casts are only needed because the data structure
580         // doesn't take const pointers, but these won't be modified at all
581         m_RpsInfo->freq_header =
582             const_cast<BlastRPSProfileHeader*>((*m_FreqsFile)());
583     }
584 
585     if (flags & fObservationsFile) {
586         m_ObsrFile.Reset(new CRpsObsrFile(path));
587 
588         // Note that these const_casts are only needed because the data structure
589         // doesn't take const pointers, but these won't be modified at all
590         m_RpsInfo->obsr_header =
591             const_cast<BlastRPSProfileHeader*>((*m_ObsrFile)());
592     }
593 
594     if (flags & fFreqRatiosFile) {
595         try {
596             // read frequency ratios data
597             m_FreqRatiosFile.Reset(new CRpsFreqRatiosFile(path));
598         } catch (const CBlastException& e) {
599         	string msg = rps_dbname + " contains no frequency ratios needed for composition-based statistics.\n" \
600         			     "Please disable composition-based statistics when searching against " + rps_dbname + ".";
601             NCBI_RETHROW(e, CBlastException, eRpsInit, msg);
602         }
603         m_RpsInfo->freq_ratios_header =
604             const_cast<BlastRPSFreqRatiosHeader*>((*m_FreqRatiosFile)());
605     }
606 }
607 
608 // Trivial at this point, but left out-of-line so that the header doesn't
609 // need to pull in full declarations of the classes to which it takes CRefs.
~CBlastRPSInfo()610 CBlastRPSInfo::~CBlastRPSInfo()
611 {
612 }
613 
614 const BlastRPSInfo*
operator ()() const615 CBlastRPSInfo::operator()() const
616 {
617     return m_RpsInfo.get();
618 }
619 
620 double
GetScalingFactor() const621 CBlastRPSInfo::GetScalingFactor() const
622 {
623     return m_RpsInfo->aux_info.scale_factor;
624 }
625 
626 const char*
GetMatrixName() const627 CBlastRPSInfo::GetMatrixName() const
628 {
629     return m_RpsInfo->aux_info.orig_score_matrix;
630 }
631 
632 int
GetGapOpeningCost() const633 CBlastRPSInfo::GetGapOpeningCost() const
634 {
635     return m_RpsInfo->aux_info.gap_open_penalty;
636 }
637 
638 int
GetGapExtensionCost() const639 CBlastRPSInfo::GetGapExtensionCost() const
640 {
641     return m_RpsInfo->aux_info.gap_extend_penalty;
642 }
643 
644 END_SCOPE(blast)
645 END_NCBI_SCOPE
646 
647 /* @} */
648