1 /*  $Id: cn3d_cache.cpp 620017 2020-11-13 19:16:01Z hurwitz $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors:  Paul Thiessen
27 *
28 * File Description:
29 *      implements a basic cache for structures
30 *
31 * ===========================================================================
32 */
33 
34 #include <ncbi_pch.hpp>
35 #include <corelib/ncbistd.hpp>
36 
37 #include <objects/ncbimime/Ncbi_mime_asn1.hpp>
38 #include <objects/ncbimime/Biostruc_seq.hpp>
39 #include <objects/seqset/Seq_entry.hpp>
40 #include <objects/seqset/Bioseq_set.hpp>
41 #include <objects/mmdb1/Biostruc_id.hpp>
42 #include <objects/mmdb1/Mmdb_id.hpp>
43 
44 #include "remove_header_conflicts.hpp"
45 
46 // for file/directory manipulation stuff
47 #ifdef __WXMSW__
48 #include <windows.h>
49 #include <wx/msw/winundef.h>
50 #endif
51 #include <wx/wx.h>
52 #include <wx/datetime.h>
53 #include <wx/file.h>
54 #include <wx/filename.h>
55 
56 #include "cn3d_cache.hpp"
57 #include "cn3d_tools.hpp"
58 #include "asn_reader.hpp"
59 
60 USING_NCBI_SCOPE;
61 USING_SCOPE(objects);
62 
63 
BEGIN_SCOPE(Cn3D)64 BEGIN_SCOPE(Cn3D)
65 
66 static string GetCacheFilePath(int mmdbID, EModel_type modelType)
67 {
68     string cachePath;
69     if (RegistryGetString(REG_CACHE_SECTION, REG_CACHE_FOLDER, &cachePath)) {
70         wxString cacheFile;
71         cacheFile.Printf("%s%c%i.%i", cachePath.c_str(), wxFILE_SEP_PATH, mmdbID, modelType);
72         cachePath = cacheFile.c_str();
73     } else
74         ERRORMSG("Can't get cache folder from registry");
75     return cachePath;
76 }
77 
CreateCacheFolder(void)78 static bool CreateCacheFolder(void)
79 {
80     string cacheFolder;
81     if (!RegistryGetString(REG_CACHE_SECTION, REG_CACHE_FOLDER, &cacheFolder)) return false;
82     if (wxDirExists(cacheFolder.c_str())) return true;
83     bool okay = wxMkdir(cacheFolder.c_str());
84     TRACEMSG((okay ? "created" : "failed to create") << " folder " << cacheFolder);
85     return okay;
86 }
87 
ExtractBioseqs(list<CRef<CSeq_entry>> & seqEntries,BioseqRefList * sequences)88 static void ExtractBioseqs(list < CRef < CSeq_entry > >& seqEntries, BioseqRefList *sequences)
89 {
90     list < CRef < CSeq_entry > >::iterator e, ee = seqEntries.end();
91     for (e=seqEntries.begin(); e!=ee; ++e) {
92         if ((*e)->IsSeq())
93             sequences->push_back(CRef<CBioseq>(&((*e)->SetSeq())));
94         else
95             ExtractBioseqs((*e)->SetSet().SetSeq_set(), sequences);
96     }
97 }
98 
ExtractBiostrucAndBioseqs(CNcbi_mime_asn1 & mime,CRef<CBiostruc> & biostruc,BioseqRefList * sequences)99 bool ExtractBiostrucAndBioseqs(CNcbi_mime_asn1& mime,
100     CRef < CBiostruc >& biostruc, BioseqRefList *sequences)
101 {
102     if (!mime.IsStrucseq()) {
103         ERRORMSG("ExtractBiostrucAndBioseqs() - expecting strucseq mime");
104         return false;
105     }
106 
107     // copy mime's biostruc into existing object
108     biostruc.Reset(&(mime.SetStrucseq().SetStructure()));
109 
110     // extract Bioseqs
111     if (sequences) {
112         sequences->clear();
113         ExtractBioseqs(mime.SetStrucseq().SetSequences(), sequences);
114     }
115 
116     return true;
117 }
118 
GetStructureFromCacheFolder(int mmdbID,EModel_type modelType)119 static CNcbi_mime_asn1 * GetStructureFromCacheFolder(int mmdbID, EModel_type modelType)
120 {
121     // try to load from cache
122     INFOMSG("looking for " << mmdbID << " (model type " << (int) modelType << ") in cache:");
123     string err, cacheFile = GetCacheFilePath(mmdbID, modelType);
124     CRef < CNcbi_mime_asn1 > mime(new CNcbi_mime_asn1());
125     SetDiagPostLevel(eDiag_Fatal); // ignore all but Fatal errors while reading data
126     bool gotFile = ReadASNFromFile(cacheFile.c_str(), mime.GetPointer(), true, &err);
127     SetDiagPostLevel(eDiag_Info);
128     if (!gotFile) {
129         WARNINGMSG("failed to load " << mmdbID
130             << " (model type " << (int) modelType << ") from cache: " << err);
131         return NULL;
132     }
133 
134     // if successful, 'touch' the file to mark it as recently used
135     INFOMSG("loaded " << cacheFile);
136     wxFileName fn(cacheFile.c_str());
137     if (!fn.Touch())
138         WARNINGMSG("error touching " << cacheFile);
139 
140     return mime.Release();
141 }
142 
143 //  If assemblyId = -1, use the predefined 'default' assembly.
144 //  Otherwise, get the specific assembly requested, where
145 //  assemblyId = 0 means the ASU, and PDB-defined assemblies
146 //  are indexed sequentially from 1.
GetStructureViaHTTPAndAddToCache(const string & uid,int mmdbID,EModel_type modelType,int assemblyId=0)147 static CNcbi_mime_asn1 * GetStructureViaHTTPAndAddToCache(
148     const string& uid, int mmdbID, EModel_type modelType, int assemblyId = 0)
149 {
150     string host, path, args;
151 
152     if (assemblyId == 0)  {
153         // construct URL [mmdbsrv.cgi]
154 
155         // this is for a test release for Gabi for testing long pdb chain ids. Dave 10/19/20.
156         // this is from Dachuan, showing what the test URL looks like, and an example.
157         // https://dev.ncbi.nlm.nih.gov/Structure/pdbtest/[mmdb|cdd|vast|vastplus|wrbsp]/[*].cgi
158         // https://dev.ncbi.nlm.nih.gov/Structure/pdbtest/mmdb/mmdbsrv.cgi
159         host = "dev.ncbi.nlm.nih.gov";                                                   // *
160         path = "/Structure/pdbtest/mmdb/mmdbsrv.cgi";                                    // *
161 
162         // this is the original, prior to making the test release for Gabi.
163         // host = "www.ncbi.nlm.nih.gov";
164         // path = "/Structure/mmdb/mmdbsrv.cgi";
165 
166         args = "save=Save&dopt=j&uid=";
167         if (mmdbID > 0)
168             args += NStr::IntToString(mmdbID);
169         else    // assume PDB id
170             args += uid;
171         args += "&Complexity=";
172         switch (modelType) {
173             case eModel_type_ncbi_all_atom: args += "3"; break;
174             case eModel_type_pdb_model: args += "4"; break;
175             case eModel_type_ncbi_backbone:
176             default:
177                 args += "2"; break;
178         }
179     }
180 
181     else {
182         // construct URL [mmdb_strview.cgi]
183         host = "www.ncbi.nlm.nih.gov";
184         path = "/Structure/mmdb/mmdb_strview.cgi";
185         args = "program=cn3d&display=1&uid=";
186         if (mmdbID > 0)
187             args += NStr::IntToString(mmdbID);
188         else    // assume PDB id
189             args += uid;
190         args += "&complexity=";
191         switch (modelType) {
192             case eModel_type_ncbi_vector: args += "1"; break;
193             case eModel_type_ncbi_all_atom: args += "3"; break;
194             case eModel_type_pdb_model: args += "4"; break;
195             case eModel_type_ncbi_backbone:
196             default:
197                 args += "2"; break;
198         }
199         args += "&buidx=" + NStr::IntToString(assemblyId);
200     }
201 
202     // load from network
203     INFOMSG("Trying to load structure data from " << host << path << '?' << args);
204     string err;
205     CRef < CNcbi_mime_asn1 > mime(new CNcbi_mime_asn1());
206 
207     if (!GetAsnDataViaHTTPS(host, path, args, mime.GetPointer(), &err) ||
208             !mime->IsStrucseq()) {
209         ERRORMSG("Failed to read structure " << uid << " from network\nreason: " << err);
210         return NULL;
211 
212     } else {
213         // get MMDB ID from biostruc if not already known
214         if (mmdbID == 0) {
215             if (mime->GetStrucseq().GetStructure().GetId().front()->IsMmdb_id())
216                 mmdbID = mime->GetStrucseq().GetStructure().GetId().front()->GetMmdb_id().Get();
217             else {
218                 ERRORMSG("Can't get MMDB ID from Biostruc!");
219                 return mime.Release();
220             }
221         }
222 
223         bool cacheEnabled;
224         if (RegistryGetBoolean(REG_CACHE_SECTION, REG_CACHE_ENABLED, &cacheEnabled) && cacheEnabled) {
225             // add to cache
226             if (CreateCacheFolder() &&
227                 WriteASNToFile(GetCacheFilePath(mmdbID, modelType).c_str(), *mime, true, &err)) {
228                 INFOMSG("stored " << mmdbID << " (model type " << (int) modelType << ") in cache");
229                 // trim cache to appropriate size if we've added a new file
230                 int size;
231                 if (RegistryGetInteger(REG_CACHE_SECTION, REG_CACHE_MAX_SIZE, &size))
232                     TruncateCache(size);
233             } else {
234                 WARNINGMSG("Failed to write structure to cache folder");
235                 if (err.size() > 0) WARNINGMSG("reason: " << err);
236             }
237         }
238     }
239 
240     return mime.Release();
241 }
242 
LoadStructureViaCache(const std::string & uid,ncbi::objects::EModel_type modelType,int assemblyId)243 CNcbi_mime_asn1 * LoadStructureViaCache(const std::string& uid, ncbi::objects::EModel_type modelType, int assemblyId)
244 {
245     // determine whether this is an integer MMDB ID or alphanumeric PDB ID
246     int mmdbID = 0;
247     if (uid.size() == 4 && (isalpha((unsigned char) uid[1]) || isalpha((unsigned char) uid[2]) || isalpha((unsigned char) uid[3]))) {
248         TRACEMSG("Fetching PDB " << uid);
249     } else {    // mmdb id
250         unsigned long tmp;
251         if (wxString(uid.c_str()).ToULong(&tmp)) {
252             mmdbID = (int) tmp;
253         } else {
254             ERRORMSG("LoadStructureViaCache() - invalid uid " << uid);
255             return NULL;
256         }
257         TRACEMSG("Fetching MMDB " << mmdbID);
258     }
259 
260     // try loading from local cache folder first, if cache enabled in registry (but only with known mmdb id)
261     bool cacheEnabled;
262     CNcbi_mime_asn1 *mime = NULL;
263     if (mmdbID > 0 &&
264             RegistryGetBoolean(REG_CACHE_SECTION, REG_CACHE_ENABLED, &cacheEnabled) &&
265             cacheEnabled)
266         mime = GetStructureFromCacheFolder(mmdbID, modelType);
267 
268     // otherwise, load via HTTP (and save in cache folder)
269     if (!mime)
270         mime = GetStructureViaHTTPAndAddToCache(uid, mmdbID, modelType, assemblyId);
271 
272     return mime;
273 }
274 
LoadStructureViaCache(const std::string & uid,ncbi::objects::EModel_type modelType,int assemblyId,CRef<CBiostruc> & biostruc,BioseqRefList * sequences)275 bool LoadStructureViaCache(const std::string& uid, ncbi::objects::EModel_type modelType, int assemblyId,
276     CRef < CBiostruc >& biostruc, BioseqRefList *sequences)
277 {
278     CRef < CNcbi_mime_asn1 > mime(LoadStructureViaCache(uid, modelType, assemblyId));
279 
280     // debugging
281     // string errStr;
282     // WriteASNToFile("mime_data.txt", mime.GetObject(), false, &errStr);
283 
284     return (mime.NotEmpty() && ExtractBiostrucAndBioseqs(*mime, biostruc, sequences));
285 }
286 
TruncateCache(unsigned int maxSize)287 void TruncateCache(unsigned int maxSize)
288 {
289     string cacheFolder;
290     if (!RegistryGetString(REG_CACHE_SECTION, REG_CACHE_FOLDER, &cacheFolder) ||
291         !wxDirExists(cacheFolder.c_str())) {
292         WARNINGMSG("can't find cache folder");
293         return;
294     }
295     INFOMSG("truncating cache to " << maxSize << " MB");
296 
297     wxString cacheFolderFiles;
298     cacheFolderFiles.Printf("%s%c*", cacheFolder.c_str(), wxFILE_SEP_PATH);
299 
300     // empty directory if maxSize <= 0
301     if (maxSize <= 0) {
302         wxString f;
303         while ((f=wxFindFirstFile(cacheFolderFiles, wxFILE)).size() > 0) {
304             if (!wxRemoveFile(f))
305                 WARNINGMSG("can't remove file " << f);
306         }
307         return;
308     }
309 
310     // otherwise, add up file sizes and keep deleting oldest until total size <= max
311     unsigned long totalSize = 0;
312     wxString oldestFileName;
313     do {
314 
315         // if totalSize > 0, then we've already scanned the folder and know it's too big,
316         // so delete oldest file
317         if (totalSize > 0 && !wxRemoveFile(oldestFileName))
318             WARNINGMSG("can't remove file " << oldestFileName);
319 
320         // loop through files, finding oldest and calculating total size
321         totalSize = 0;
322         time_t oldestFileDate = wxDateTime::GetTimeNow(), date;
323         wxString file = wxFindFirstFile(cacheFolderFiles, wxFILE);
324         for (; file.size() > 0; file = wxFindNextFile()) {
325             date = wxFileModificationTime(file);
326             if (date < oldestFileDate) {
327                 oldestFileDate = date;
328                 oldestFileName = file;
329             }
330             wxFile wx_file(file, wxFile::read);
331             if (wx_file.IsOpened()) {
332                 totalSize += wx_file.Length();
333                 wx_file.Close();
334             } else
335                 WARNINGMSG("wxFile failed to open " << file);
336         }
337         INFOMSG("total size: " << totalSize << " oldest file: " << oldestFileName.c_str());
338 
339     } while (totalSize > maxSize * 1024 * 1024);
340 }
341 
342 END_SCOPE(Cn3D)
343