1 /* $Id: asn_cache_test.cpp 619504 2020-11-05 18:25:12Z badrazat $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Mike Diccucio Cheinan Marks
27 *
28 * File Description:
29 * Test reading from the ID ASN.1 Cache.
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbienv.hpp>
36 #include <corelib/ncbiargs.hpp>
37 #include <corelib/ncbifile.hpp>
38 #include <objects/seqset/Seq_entry.hpp>
39 #include <objects/seqloc/Seq_id.hpp>
40 #include <objects/seq/Bioseq.hpp>
41
42 #include <serial/serial.hpp>
43 #include <serial/objostr.hpp>
44 #include <serial/iterator.hpp>
45
46 #include <objtools/data_loaders/asn_cache/asn_cache.hpp>
47 #include <objtools/data_loaders/asn_cache/asn_cache_util.hpp>
48 #include <objtools/data_loaders/asn_cache/asn_cache_loader.hpp>
49
50 #include <objmgr/object_manager.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/bioseq_handle.hpp>
53 #include <objmgr/impl/tse_info.hpp>
54 #include <objmgr/impl/data_source.hpp>
55
56 #include <objects/seq/Delta_ext.hpp>
57 #include <objects/seq/Delta_seq.hpp>
58 #include <objects/seq/Seq_inst.hpp>
59 #include <objects/seq/Seq_ext.hpp>
60 #include <objects/seq/seq_id_handle.hpp>
61 #include <objects/seqfeat/Gb_qual.hpp>
62
63
64
65 USING_NCBI_SCOPE;
66 USING_SCOPE(objects);
67
s_GetSeqIds(const vector<CSeq_id_Handle> & handles)68 static const CBioseq::TId &s_GetSeqIds(const vector<CSeq_id_Handle> &handles){
69 static CBioseq::TId ids;
70 ids.clear();
71 ITERATE(vector<CSeq_id_Handle>, it, handles)
72 ids.push_back(CRef<CSeq_id>(const_cast<CSeq_id*>(it->GetSeqId().GetNonNullPointer())));
73 return ids;
74 }
75
s_SameIds(const CBioseq::TId & ids1,const CBioseq::TId & ids2)76 static bool s_SameIds(const CBioseq::TId &ids1, const CBioseq::TId &ids2)
77 {
78 if(ids1.size() != ids2.size())
79 return false;
80 ITERATE(CBioseq::TId, it1, ids1){
81 CRef<CSeq_id> matching_id;
82 ITERATE(CBioseq::TId, it2, ids2)
83 if((*it2)->Match(**it1)){
84 matching_id = *it2;
85 break;
86 }
87 if(!matching_id)
88 return false;
89 }
90 return true;
91 }
92
93 /////////////////////////////////////////////////////////////////////////////
94 // CAsnCacheTestApplication::
95
96
97 class CAsnCacheTestApplication : public CNcbiApplication
98 {
99 private:
100 virtual void Init(void);
101 virtual int Run(void);
102 virtual void Exit(void);
103
104 private: //member functions
105
106 bool x_FindAnnotated(const CSeq_entry& entry);
107
108
109 };
110
111
112 /////////////////////////////////////////////////////////////////////////////
113 // Init test for all different types of arguments
114
115
Init(void)116 void CAsnCacheTestApplication::Init(void)
117 {
118 // Create command-line argument descriptions class
119 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
120
121 // Specify USAGE context
122 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
123 "CArgDescriptions demo program");
124
125 arg_desc->AddKey("cache", "ASNCache",
126 "Path to ASN.1 cache",
127 CArgDescriptions::eInputFile);
128
129 arg_desc->AddDefaultKey("i", "AccessionList",
130 "List of accessions to retrieve",
131 CArgDescriptions::eInputFile,
132 "-");
133
134 arg_desc->AddDefaultKey("o", "OutputFile",
135 "File to place ASN seq-entries in",
136 CArgDescriptions::eOutputFile,
137 "-");
138
139 arg_desc->AddFlag("test-loader", "Test use of the ASN cache data loader");
140
141 arg_desc->AddFlag("raw", "Test raw retrieval only");
142 arg_desc->AddFlag("text", "Use ASN.1 text output");
143 arg_desc->AddFlag("find-annotated", "Find annotated accessions");
144 arg_desc->AddFlag("dump-GP-7574", "Dump comments and seq-descs");
145 arg_desc->AddFlag("dump-GP-8763", "Dump qualifiers");
146 arg_desc->AddFlag("dump-proteins", "Dump protein seqs");
147 arg_desc->AddFlag("no-serialize", "Do not reserialize the ASN.1");
148 arg_desc->AddFlag("random-order", "Retrieve sequences in random order");
149 arg_desc->AddFlag("test-warm", "Retrieve sequences twice, to check differences between cold-cache and warm-cache times");
150
151 arg_desc->AddFlag("indexonly", "Print the index entry only, do not fetch the blob." );
152
153 arg_desc->AddFlag("idonly", "Verify that ID information in the cache is available and accurate for the listed accessions." );
154 arg_desc->AddFlag("verify-ids", "Verify that ID information in the cache is available and accurate for the listed accessions." );
155 arg_desc->SetDependency("raw", CArgDescriptions::eExcludes, "indexonly");
156 arg_desc->SetDependency("verify-ids", CArgDescriptions::eExcludes, "raw");
157 arg_desc->SetDependency("verify-ids", CArgDescriptions::eExcludes, "indexonly");
158 arg_desc->SetDependency("test-loader", CArgDescriptions::eExcludes, "raw");
159 arg_desc->SetDependency("test-loader", CArgDescriptions::eExcludes, "indexonly");
160 arg_desc->SetDependency("idonly", CArgDescriptions::eExcludes, "raw");
161 arg_desc->SetDependency("idonly", CArgDescriptions::eExcludes, "indexonly");
162 arg_desc->SetDependency("idonly", CArgDescriptions::eExcludes, "verify-ids");
163 arg_desc->SetDependency("no-serialize", CArgDescriptions::eExcludes, "indexonly");
164 arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "raw");
165 arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "indexonly");
166 arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "idonly");
167 arg_desc->SetDependency("text", CArgDescriptions::eExcludes, "no-serialize");
168
169 arg_desc->AddFlag("get-multiple",
170 "If several entries match the specified id, get all of "
171 "them, not only latest one");
172 arg_desc->SetDependency("get-multiple", CArgDescriptions::eExcludes, "idonly");
173 arg_desc->SetDependency("get-multiple", CArgDescriptions::eExcludes, "test-loader");
174
175 // Setup arg.descriptions for this application
176 SetupArgDescriptions(arg_desc.release());
177 }
178
179
180
Run(void)181 int CAsnCacheTestApplication::Run(void)
182 {
183 // Get arguments
184 const CArgs& args = GetArgs();
185 CNcbiOstream& ostr = args["o"].AsOutputFile();
186 CNcbiIstream& istr = args["i"].AsInputFile();
187
188 bool readIndexOnly = args[ "indexonly" ];
189 bool raw = args["raw"];
190 bool serialize = !args["no-serialize"];
191 bool verify_ids = args["verify-ids"];
192 bool getIdOnly = args["idonly"];
193 bool multiple = args["get-multiple"];
194
195 vector< CConstRef<CSeq_entry> > entries;
196 vector< CDataLoader::TIds > id_sets;
197 vector< CSeq_id_Handle > ids;
198
199 auto_ptr<CObjectOStream> os;
200
201 string line;
202 while (NcbiGetlineEOL(istr, line)) {
203 if (line.empty() || line[0] == '#') {
204 continue;
205 }
206 try {
207 CSeq_id id(line);
208 ids.push_back(CSeq_id_Handle::GetHandle(id));
209 }
210 catch (CException& e) {
211 LOG_POST(Error << "failed to convert "
212 << line << " to a SeqId: " << e.what());
213 }
214 }
215
216 if (serialize) {
217 if(getIdOnly) {
218 id_sets.reserve(ids.size());
219 }
220 else {
221 entries.reserve(ids.size());
222 }
223 }
224
225
226 CRef<CDataLoader> loader;
227 CRef<CAsnCache> cache;
228 CRef<CDataSource> source;
229 CRef<CScope> m_LocalCacheScope;
230
231 if(args["test-loader"]){
232 loader.Reset(CAsnCache_DataLoader::TDbMaker(args["cache"].AsString()).CreateLoader());
233 source.Reset(new CDataSource(*loader));
234 } else
235 cache.Reset(new CAsnCache(args["cache"].AsString()));
236 if(args["dump-proteins"]){
237 m_LocalCacheScope.Reset(new CScope (*CObjectManager::GetInstance()));
238 string cache_path = args["cache"].AsString();
239 CAsnCache_DataLoader::RegisterInObjectManager(*CObjectManager::GetInstance(), cache_path, CObjectManager::eDefault, 1 );
240 string loader_name = CAsnCache_DataLoader::GetLoaderNameFromArgs(cache_path);
241 if ( CObjectManager::GetInstance()->FindDataLoader(loader_name ) ) {
242 loader_name = CObjectManager::GetInstance()->FindDataLoader(loader_name )->GetName() ;
243 } else {
244 }
245 m_LocalCacheScope->AddDataLoader(loader_name );
246 }
247
248
249 int num_cycles = args["test-warm"] ? 2 : 1;
250 size_t count_failed = 0;
251
252 for(int cycle = 0; cycle < num_cycles; cycle++){
253 if (args["random-order"]) {
254 random_shuffle(ids.begin(), ids.end());
255 }
256
257 size_t count = 0;
258 CStopWatch sw;
259 sw.Start();
260
261 ITERATE(vector<CSeq_id_Handle>, id_it, ids){
262 try {
263 if ( readIndexOnly ) {
264 vector<CAsnIndex::SIndexInfo> info;
265
266 if (multiple) {
267 cache->GetMultipleIndexEntries( *id_it, info);
268 } else {
269 info.resize(1);
270 cache->GetIndexEntry( *id_it, info[0]);
271 }
272
273 ITERATE (vector<CAsnIndex::SIndexInfo>, info_it, info) {
274 ostr << *info_it << endl;
275 }
276 } else if (raw) {
277 vector<CAsnCache::TBuffer> buffer(multiple ? 0 : 1);
278 bool success = multiple ? cache->GetMultipleRaw(*id_it, buffer)
279 : cache->GetRaw(*id_it, buffer[0]);
280 if (success) {
281 if(serialize && cycle == 0) {
282 ITERATE (vector<CAsnCache::TBuffer>, buf_it, buffer) {
283 ostr.write((const char*)&(*buf_it)[0],
284 buf_it->size());
285 }
286 }
287 } else {
288 LOG_POST(Error << "failed to retrieve: "
289 << id_it->GetSeqId()->AsFastaString());
290 ++count_failed;
291 }
292 } else if (getIdOnly) {
293 CDataLoader::TIds id_set;
294 if(loader)
295 loader->GetIds(*id_it, id_set);
296 else
297 cache->GetSeqIds(*id_it, id_set);
298 if(id_set.empty()) {
299 LOG_POST(Error << "failed to retrieve: "
300 << id_it->GetSeqId()->AsFastaString());
301 ++count_failed;
302 } else if(serialize && cycle==0)
303 id_sets.push_back(id_set);
304 } else {
305 vector< CRef<CSeq_entry> > entries_for_id;
306 if (multiple) {
307 entries_for_id = cache->GetMultipleEntries(*id_it);
308 } else if (loader) {
309 entries_for_id.push_back(CRef<CSeq_entry>(const_cast<CSeq_entry *>
310 ((*loader->GetRecords(*id_it, CDataLoader::eBioseq).begin())->GetCompleteTSE().GetPointer())));
311 } else {
312 entries_for_id.push_back(cache->GetEntry(*id_it));
313 }
314 ITERATE (vector< CRef<CSeq_entry> >, entry_it, entries_for_id) {
315 CRef<CSeq_entry> entry = *entry_it;
316 if(args["find-annotated"]) {
317 bool is_annotated = x_FindAnnotated(*entry);
318 CDataLoader::TIds id_set;
319 if(loader) {
320 loader->GetIds(*id_it, id_set);
321 }
322 else {
323 cache->GetSeqIds(*id_it, id_set);
324 }
325 cout<<"is_annotated" << "\t"
326 << "original" << "\t"
327 << *id_it << "\t"
328 << boolalpha << is_annotated << endl;
329 ITERATE(CDataLoader::TIds, id_it, id_set) {
330 cout<<"is_annotated" << "\t"
331 << "alias" << "\t"
332 << *id_it << "\t"
333 << boolalpha << is_annotated << endl;
334 }
335 }
336 if(entry && args["dump-GP-7574"]) {
337 cerr << *id_it << endl;
338 for(CTypeConstIterator<CSeqdesc> desc(*entry); desc; ++desc) {
339 switch ( desc->Which() ) {
340 case CSeqdesc::e_User:
341 case CSeqdesc::e_Comment:
342 cout << MSerial_AsnText << *desc;
343 break;
344 default: break;
345 }
346 }
347 }
348 if(entry && args["dump-GP-8763"]) {
349 cerr << *id_it << endl; bool first=true;
350 for(CTypeConstIterator<CGb_qual> desc(*entry); desc; ++desc) {
351 if(desc->GetQual() == "inference" ||
352 desc->GetQual() == "experiment"
353 ) {
354 if(first) { cout << *id_it << endl; } first=false;
355 cout << MSerial_AsnText << *desc;
356 }
357 }
358 }
359
360
361 if(entry && args["dump-proteins"]) {
362 for(CTypeConstIterator<CSeq_feat> feat(*entry); feat; ++feat) {
363 if( feat->GetData().Which() == CSeqFeatData::e_Cdregion &&
364 feat->IsSetProduct() ) {
365 CSeq_id_Handle idh = CSeq_id_Handle::GetHandle(*feat->GetProduct().GetId());
366 CBioseq_Handle bsh = m_LocalCacheScope->GetBioseqHandle(idh);
367 if( bsh.CanGetInst() ) {
368 if (
369 bsh.GetInst().IsSetExt() &&
370 bsh.GetInst().GetExt().IsDelta() ) {
371 ITERATE(CDelta_ext::Tdata, delta, bsh.GetInst().GetExt().GetDelta().Get() ) {
372 if( (*delta)->IsLoc() ) {
373 idh = CSeq_id_Handle::GetHandle( *(*delta)->GetLoc().GetId() );
374 break;
375 }
376 }
377 }
378 } else {
379 LOG_POST(Warning<<"Can't get seqinst for " << idh);
380 }
381 cout << *id_it << "\t"
382 << idh << endl;
383 //auto_ptr<CBestPigMapper> m_mapper
384 }
385 }
386 }
387 if (entry) {
388 if (serialize && cycle==0) {
389 if( ! args["find-annotated"])
390 entries.push_back(entry);
391 }
392 if(verify_ids){
393 CDataLoader::TIds retrieved_ids;
394 if(loader)
395 loader->GetIds(*id_it, retrieved_ids);
396 else
397 cache->GetSeqIds(*id_it, retrieved_ids);
398 if(retrieved_ids.empty()){
399 LOG_POST(Error << "failed to retrieve ids for: "
400 << id_it->GetSeqId()->AsFastaString());
401 ++count_failed;
402 } else if(!s_SameIds(s_GetSeqIds(retrieved_ids), ExtractBioseq(entry, *id_it)->GetId()))
403 {
404 LOG_POST(Error << "Mismatched ids for: "
405 << id_it->GetSeqId()->AsFastaString() << ": retrieved:");
406 ITERATE(vector<CSeq_id_Handle>, it, retrieved_ids)
407 LOG_POST(Error << it->GetSeqId()->AsFastaString());
408 LOG_POST(Error << "IDs in entry:");
409 ITERATE(CBioseq::TId, it, ExtractBioseq(entry, *id_it)->GetId())
410 LOG_POST(Error << (*it)->AsFastaString());
411 ++count_failed;
412 } else
413 LOG_POST(Info << "Succesfully retrieved " << retrieved_ids.size()
414 << " ids for " << id_it->GetSeqId()->AsFastaString());
415 }
416 } else {
417 LOG_POST(Error << "failed to retrieve: "
418 << id_it->GetSeqId()->AsFastaString());
419 ++count_failed;
420 }
421 }
422 }
423
424 ++count;
425 }
426 catch (CException& e) {
427 LOG_POST(Error << "failed to retrieve "
428 << line << ": " << e.what());
429 }
430 }
431
432 double e = sw.Elapsed();
433 LOG_POST(Error << "done cycle " << cycle+1 << ", " << count << " seqs / " << e << " seconds = "
434 << count / e << " seqs/sec ("
435 << count_failed << " failed to retrieve)");
436 }
437 if(args["dump-proteins"]) { return count_failed; }
438 if(serialize) {
439 if(getIdOnly){
440 ITERATE(vector< CDataLoader::TIds >, it, id_sets){
441 ITERATE(CDataLoader::TIds, id_it, *it){
442 if(id_it != it->begin())
443 ostr << ", ";
444 ostr << id_it->GetSeqId()->AsFastaString();
445 }
446 ostr << endl;
447 }
448 } else {
449 if (args["text"]) {
450 os.reset(CObjectOStream::Open(eSerial_AsnText, ostr));
451 } else {
452 os.reset(CObjectOStream::Open(eSerial_AsnBinary, ostr));
453 }
454 ITERATE(vector< CConstRef<CSeq_entry> >, it, entries)
455 *os << **it;
456 }
457 }
458
459 return count_failed;
460 }
461
x_FindAnnotated(const CSeq_entry & entry)462 bool CAsnCacheTestApplication::x_FindAnnotated(const CSeq_entry& entry)
463 {
464 return
465 ( entry.IsSeq() && entry.GetSeq().IsSetAnnot() ) ||
466 ( entry.IsSet() && entry.GetSet().IsSetAnnot() );
467
468 }
469
470 /////////////////////////////////////////////////////////////////////////////
471 // Cleanup
472
473
Exit(void)474 void CAsnCacheTestApplication::Exit(void)
475 {
476 SetDiagStream(0);
477 }
478
479
480 /////////////////////////////////////////////////////////////////////////////
481 // MAIN
482
483
main(int argc,const char * argv[])484 int main(int argc, const char* argv[])
485 {
486 // Execute main application function
487 return CAsnCacheTestApplication().AppMain(argc, argv);
488 }
489