1 /*  $Id: asn_cache_test.cpp 619504 2020-11-05 18:25:12Z badrazat $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
 * Authors:  Mike DiCuccio, Cheinan Marks
27  *
28  * File Description:
29  * Test reading from the ID ASN.1 Cache.
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbienv.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <objects/seqset/Seq_entry.hpp>
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seq/Bioseq.hpp>
41 
42 #include <serial/serial.hpp>
43 #include <serial/objostr.hpp>
44 #include <serial/iterator.hpp>
45 
46 #include <objtools/data_loaders/asn_cache/asn_cache.hpp>
47 #include <objtools/data_loaders/asn_cache/asn_cache_util.hpp>
48 #include <objtools/data_loaders/asn_cache/asn_cache_loader.hpp>
49 
50 #include <objmgr/object_manager.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/bioseq_handle.hpp>
53 #include <objmgr/impl/tse_info.hpp>
54 #include <objmgr/impl/data_source.hpp>
55 
56 #include <objects/seq/Delta_ext.hpp>
57 #include <objects/seq/Delta_seq.hpp>
58 #include <objects/seq/Seq_inst.hpp>
59 #include <objects/seq/Seq_ext.hpp>
60 #include <objects/seq/seq_id_handle.hpp>
61 #include <objects/seqfeat/Gb_qual.hpp>
62 
63 
64 
65 USING_NCBI_SCOPE;
66 USING_SCOPE(objects);
67 
s_GetSeqIds(const vector<CSeq_id_Handle> & handles)68 static const CBioseq::TId &s_GetSeqIds(const vector<CSeq_id_Handle> &handles){
69     static CBioseq::TId ids;
70     ids.clear();
71     ITERATE(vector<CSeq_id_Handle>, it, handles)
72         ids.push_back(CRef<CSeq_id>(const_cast<CSeq_id*>(it->GetSeqId().GetNonNullPointer())));
73     return ids;
74 }
75 
s_SameIds(const CBioseq::TId & ids1,const CBioseq::TId & ids2)76 static bool s_SameIds(const CBioseq::TId &ids1, const CBioseq::TId &ids2)
77 {
78     if(ids1.size() != ids2.size())
79         return false;
80     ITERATE(CBioseq::TId, it1, ids1){
81         CRef<CSeq_id> matching_id;
82         ITERATE(CBioseq::TId, it2, ids2)
83             if((*it2)->Match(**it1)){
84                 matching_id = *it2;
85                 break;
86             }
87         if(!matching_id)
88             return false;
89     }
90     return true;
91 }
92 
93 /////////////////////////////////////////////////////////////////////////////
94 //  CAsnCacheTestApplication::
95 
96 
97 class CAsnCacheTestApplication : public CNcbiApplication
98 {
99 private:
100     virtual void Init(void);
101     virtual int  Run(void);
102     virtual void Exit(void);
103 
104 private: //member functions
105 
106     bool x_FindAnnotated(const CSeq_entry& entry);
107 
108 
109 };
110 
111 
112 /////////////////////////////////////////////////////////////////////////////
//  Init: describe the command-line arguments accepted by the test
114 
115 
Init(void)116 void CAsnCacheTestApplication::Init(void)
117 {
118     // Create command-line argument descriptions class
119     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
120 
121     // Specify USAGE context
122     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
123                               "CArgDescriptions demo program");
124 
125     arg_desc->AddKey("cache", "ASNCache",
126                      "Path to ASN.1 cache",
127                      CArgDescriptions::eInputFile);
128 
129     arg_desc->AddDefaultKey("i", "AccessionList",
130                             "List of accessions to retrieve",
131                             CArgDescriptions::eInputFile,
132                             "-");
133 
134     arg_desc->AddDefaultKey("o", "OutputFile",
135                             "File to place ASN seq-entries in",
136                             CArgDescriptions::eOutputFile,
137                             "-");
138 
139     arg_desc->AddFlag("test-loader",  "Test use of the ASN cache data loader");
140 
141     arg_desc->AddFlag("raw",  "Test raw retrieval only");
142     arg_desc->AddFlag("text", "Use ASN.1 text output");
143     arg_desc->AddFlag("find-annotated", "Find annotated accessions");
144     arg_desc->AddFlag("dump-GP-7574", "Dump comments and seq-descs");
145     arg_desc->AddFlag("dump-GP-8763", "Dump qualifiers");
146     arg_desc->AddFlag("dump-proteins", "Dump protein seqs");
147     arg_desc->AddFlag("no-serialize", "Do not reserialize the ASN.1");
148     arg_desc->AddFlag("random-order", "Retrieve sequences in random order");
149     arg_desc->AddFlag("test-warm", "Retrieve sequences twice, to check differences between cold-cache and warm-cache times");
150 
151     arg_desc->AddFlag("indexonly", "Print the index entry only, do not fetch the blob." );
152 
153     arg_desc->AddFlag("idonly", "Verify that ID information in the cache is available and accurate for the listed accessions." );
154     arg_desc->AddFlag("verify-ids", "Verify that ID information in the cache is available and accurate for the listed accessions." );
155     arg_desc->SetDependency("raw", CArgDescriptions::eExcludes, "indexonly");
156     arg_desc->SetDependency("verify-ids", CArgDescriptions::eExcludes, "raw");
157     arg_desc->SetDependency("verify-ids", CArgDescriptions::eExcludes, "indexonly");
158     arg_desc->SetDependency("test-loader", CArgDescriptions::eExcludes, "raw");
159     arg_desc->SetDependency("test-loader", CArgDescriptions::eExcludes, "indexonly");
160     arg_desc->SetDependency("idonly", CArgDescriptions::eExcludes, "raw");
161     arg_desc->SetDependency("idonly", CArgDescriptions::eExcludes, "indexonly");
162     arg_desc->SetDependency("idonly", CArgDescriptions::eExcludes, "verify-ids");
163     arg_desc->SetDependency("no-serialize", CArgDescriptions::eExcludes, "indexonly");
164     arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "raw");
165     arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "indexonly");
166     arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "idonly");
167     arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "no-serialize");
168 
169     arg_desc->AddFlag("get-multiple",
170                       "If several entries match the specified id, get all of "
171                       "them, not only latest one");
172     arg_desc->SetDependency("get-multiple", CArgDescriptions::eExcludes, "idonly");
173     arg_desc->SetDependency("get-multiple", CArgDescriptions::eExcludes, "test-loader");
174 
175     // Setup arg.descriptions for this application
176     SetupArgDescriptions(arg_desc.release());
177 }
178 
179 
180 
Run(void)181 int CAsnCacheTestApplication::Run(void)
182 {
183     // Get arguments
184     const CArgs& args = GetArgs();
185     CNcbiOstream& ostr = args["o"].AsOutputFile();
186     CNcbiIstream& istr = args["i"].AsInputFile();
187 
188     bool readIndexOnly = args[ "indexonly" ];
189     bool raw = args["raw"];
190     bool serialize = !args["no-serialize"];
191     bool verify_ids = args["verify-ids"];
192     bool getIdOnly = args["idonly"];
193     bool multiple = args["get-multiple"];
194 
195     vector< CConstRef<CSeq_entry> > entries;
196     vector< CDataLoader::TIds > id_sets;
197     vector< CSeq_id_Handle > ids;
198 
199     auto_ptr<CObjectOStream> os;
200 
201     string line;
202     while (NcbiGetlineEOL(istr, line)) {
203         if (line.empty()  ||  line[0] == '#') {
204             continue;
205         }
206         try {
207             CSeq_id id(line);
208             ids.push_back(CSeq_id_Handle::GetHandle(id));
209         }
210         catch (CException& e) {
211             LOG_POST(Error << "failed to convert "
212                      << line << " to a SeqId: " << e.what());
213         }
214     }
215 
216     if (serialize) {
217         if(getIdOnly) {
218             id_sets.reserve(ids.size());
219         }
220         else {
221             entries.reserve(ids.size());
222         }
223     }
224 
225 
226     CRef<CDataLoader> loader;
227     CRef<CAsnCache> cache;
228     CRef<CDataSource> source;
229     CRef<CScope> m_LocalCacheScope;
230 
231     if(args["test-loader"]){
232         loader.Reset(CAsnCache_DataLoader::TDbMaker(args["cache"].AsString()).CreateLoader());
233         source.Reset(new CDataSource(*loader));
234     } else
235         cache.Reset(new CAsnCache(args["cache"].AsString()));
236     if(args["dump-proteins"]){
237         m_LocalCacheScope.Reset(new CScope (*CObjectManager::GetInstance()));
238         string cache_path = args["cache"].AsString();
239         CAsnCache_DataLoader::RegisterInObjectManager(*CObjectManager::GetInstance(), cache_path, CObjectManager::eDefault, 1 );
240         string loader_name = CAsnCache_DataLoader::GetLoaderNameFromArgs(cache_path);
241         if ( CObjectManager::GetInstance()->FindDataLoader(loader_name ) ) {
242             loader_name = CObjectManager::GetInstance()->FindDataLoader(loader_name )->GetName()  ;
243         } else {
244         }
245         m_LocalCacheScope->AddDataLoader(loader_name );
246     }
247 
248 
249     int num_cycles = args["test-warm"] ? 2 : 1;
250     size_t count_failed = 0;
251 
252     for(int cycle = 0; cycle < num_cycles; cycle++){
253         if (args["random-order"]) {
254             random_shuffle(ids.begin(), ids.end());
255         }
256 
257         size_t count = 0;
258         CStopWatch sw;
259         sw.Start();
260 
261         ITERATE(vector<CSeq_id_Handle>, id_it, ids){
262             try {
263                 if ( readIndexOnly ) {
264                     vector<CAsnIndex::SIndexInfo> info;
265 
266                     if (multiple) {
267                         cache->GetMultipleIndexEntries( *id_it, info);
268                     } else {
269                         info.resize(1);
270                         cache->GetIndexEntry( *id_it, info[0]);
271                     }
272 
273                     ITERATE (vector<CAsnIndex::SIndexInfo>, info_it, info) {
274                         ostr << *info_it << endl;
275                     }
276                 } else if (raw) {
277                     vector<CAsnCache::TBuffer> buffer(multiple ? 0 : 1);
278                     bool success = multiple ? cache->GetMultipleRaw(*id_it, buffer)
279                                             : cache->GetRaw(*id_it, buffer[0]);
280                     if (success) {
281                         if(serialize && cycle == 0) {
282                             ITERATE (vector<CAsnCache::TBuffer>, buf_it, buffer) {
283                                 ostr.write((const char*)&(*buf_it)[0],
284                                            buf_it->size());
285                             }
286                         }
287                     } else {
288                         LOG_POST(Error << "failed to retrieve: "
289                                  << id_it->GetSeqId()->AsFastaString());
290                         ++count_failed;
291                     }
292                 } else if (getIdOnly) {
293                     CDataLoader::TIds id_set;
294                     if(loader)
295                         loader->GetIds(*id_it, id_set);
296                     else
297                         cache->GetSeqIds(*id_it, id_set);
298                     if(id_set.empty()) {
299                         LOG_POST(Error << "failed to retrieve: "
300                                  << id_it->GetSeqId()->AsFastaString());
301                         ++count_failed;
302                     } else if(serialize && cycle==0)
303                         id_sets.push_back(id_set);
304                 } else {
305                   vector< CRef<CSeq_entry> > entries_for_id;
306                   if (multiple) {
307                     entries_for_id = cache->GetMultipleEntries(*id_it);
308                   } else if (loader) {
309                     entries_for_id.push_back(CRef<CSeq_entry>(const_cast<CSeq_entry *>
310                         ((*loader->GetRecords(*id_it, CDataLoader::eBioseq).begin())->GetCompleteTSE().GetPointer())));
311                   } else {
312                     entries_for_id.push_back(cache->GetEntry(*id_it));
313                   }
314                   ITERATE (vector< CRef<CSeq_entry> >, entry_it, entries_for_id) {
315                     CRef<CSeq_entry> entry = *entry_it;
316                     if(args["find-annotated"]) {
317                         bool is_annotated = x_FindAnnotated(*entry);
318                         CDataLoader::TIds id_set;
319                         if(loader) {
320                             loader->GetIds(*id_it, id_set);
321                         }
322                         else {
323                             cache->GetSeqIds(*id_it, id_set);
324                         }
325                             cout<<"is_annotated" << "\t"
326                                 << "original" << "\t"
327                                 << *id_it << "\t"
328                                 << boolalpha << is_annotated << endl;
329                         ITERATE(CDataLoader::TIds, id_it, id_set) {
330                             cout<<"is_annotated" << "\t"
331                                 << "alias" << "\t"
332                                 << *id_it << "\t"
333                                 << boolalpha << is_annotated << endl;
334                         }
335                     }
336                     if(entry && args["dump-GP-7574"]) {
337                         cerr << *id_it << endl;
338                         for(CTypeConstIterator<CSeqdesc> desc(*entry); desc; ++desc) {
339                             switch ( desc->Which() ) {
340                                 case CSeqdesc::e_User:
341                                 case CSeqdesc::e_Comment:
342                                     cout << MSerial_AsnText << *desc;
343                                     break;
344                                 default: break;
345                             }
346                         }
347                     }
348                     if(entry && args["dump-GP-8763"]) {
349                         cerr << *id_it << endl; bool first=true;
350                         for(CTypeConstIterator<CGb_qual> desc(*entry); desc; ++desc) {
351                             if(desc->GetQual() == "inference" ||
352                                desc->GetQual() == "experiment"
353                             ) {
354                                 if(first) { cout << *id_it << endl;  } first=false;
355                                 cout << MSerial_AsnText << *desc;
356                             }
357                         }
358                     }
359 
360 
361                     if(entry && args["dump-proteins"]) {
362                         for(CTypeConstIterator<CSeq_feat> feat(*entry); feat; ++feat) {
363                             if( feat->GetData().Which() == CSeqFeatData::e_Cdregion &&
364                                 feat->IsSetProduct() ) {
365                                 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*feat->GetProduct().GetId());
366                                 CBioseq_Handle bsh = m_LocalCacheScope->GetBioseqHandle(idh);
367                                 if( bsh.CanGetInst() ) {
368                                     if (
369                                         bsh.GetInst().IsSetExt() &&
370                                         bsh.GetInst().GetExt().IsDelta() ) {
371                                         ITERATE(CDelta_ext::Tdata, delta, bsh.GetInst().GetExt().GetDelta().Get() ) {
372                                             if( (*delta)->IsLoc() ) {
373                                                 idh = CSeq_id_Handle::GetHandle( *(*delta)->GetLoc().GetId() );
374                                                 break;
375                                             }
376                                         }
377                                     }
378                                 } else  {
379                                     LOG_POST(Warning<<"Can't get seqinst for " << idh);
380                                 }
381                                 cout << *id_it << "\t"
382                                     << idh << endl;
383                                 //auto_ptr<CBestPigMapper> m_mapper
384                             }
385                         }
386                     }
387                     if (entry) {
388                         if (serialize && cycle==0) {
389                             if( ! args["find-annotated"])
390                                 entries.push_back(entry);
391                         }
392                         if(verify_ids){
393                             CDataLoader::TIds retrieved_ids;
394                             if(loader)
395                                 loader->GetIds(*id_it, retrieved_ids);
396                             else
397                                 cache->GetSeqIds(*id_it, retrieved_ids);
398                             if(retrieved_ids.empty()){
399                                 LOG_POST(Error << "failed to retrieve ids for: "
400                                          << id_it->GetSeqId()->AsFastaString());
401                                 ++count_failed;
402                             } else if(!s_SameIds(s_GetSeqIds(retrieved_ids), ExtractBioseq(entry, *id_it)->GetId()))
403                             {
404                                 LOG_POST(Error << "Mismatched ids for: "
405                                          << id_it->GetSeqId()->AsFastaString() << ": retrieved:");
406                                 ITERATE(vector<CSeq_id_Handle>, it, retrieved_ids)
407                                     LOG_POST(Error << it->GetSeqId()->AsFastaString());
408                                 LOG_POST(Error << "IDs in entry:");
409                                 ITERATE(CBioseq::TId, it, ExtractBioseq(entry, *id_it)->GetId())
410                                     LOG_POST(Error << (*it)->AsFastaString());
411                                 ++count_failed;
412                             } else
413                                 LOG_POST(Info << "Succesfully retrieved " << retrieved_ids.size()
414                                               << " ids for " << id_it->GetSeqId()->AsFastaString());
415                         }
416                     } else {
417                         LOG_POST(Error << "failed to retrieve: "
418                                  << id_it->GetSeqId()->AsFastaString());
419                         ++count_failed;
420                     }
421                   }
422                 }
423 
424                 ++count;
425             }
426             catch (CException& e) {
427                 LOG_POST(Error << "failed to retrieve "
428                          << line << ": " << e.what());
429             }
430         }
431 
432         double e = sw.Elapsed();
433         LOG_POST(Error << "done cycle " << cycle+1 << ", " << count << " seqs / " << e << " seconds = "
434                  << count / e << " seqs/sec ("
435                  << count_failed << " failed to retrieve)");
436     }
437     if(args["dump-proteins"]) { return count_failed; }
438     if(serialize) {
439         if(getIdOnly){
440             ITERATE(vector< CDataLoader::TIds >, it, id_sets){
441                 ITERATE(CDataLoader::TIds, id_it, *it){
442                     if(id_it != it->begin())
443                         ostr << ", ";
444                     ostr << id_it->GetSeqId()->AsFastaString();
445                 }
446                 ostr << endl;
447             }
448         } else {
449             if (args["text"]) {
450                 os.reset(CObjectOStream::Open(eSerial_AsnText, ostr));
451             } else {
452                 os.reset(CObjectOStream::Open(eSerial_AsnBinary, ostr));
453             }
454             ITERATE(vector< CConstRef<CSeq_entry> >, it, entries)
455                 *os << **it;
456         }
457     }
458 
459     return count_failed;
460 }
461 
x_FindAnnotated(const CSeq_entry & entry)462 bool CAsnCacheTestApplication::x_FindAnnotated(const CSeq_entry& entry)
463 {
464     return
465        ( entry.IsSeq() && entry.GetSeq().IsSetAnnot() ) ||
466        ( entry.IsSet() && entry.GetSet().IsSetAnnot() );
467 
468 }
469 
470 /////////////////////////////////////////////////////////////////////////////
471 //  Cleanup
472 
473 
Exit(void)474 void CAsnCacheTestApplication::Exit(void)
475 {
476     SetDiagStream(0);
477 }
478 
479 
480 /////////////////////////////////////////////////////////////////////////////
481 //  MAIN
482 
483 
main(int argc,const char * argv[])484 int main(int argc, const char* argv[])
485 {
486     // Execute main application function
487     return CAsnCacheTestApplication().AppMain(argc, argv);
488 }
489