1 /*  $Id: test_csra_loader_mt.cpp 632472 2021-06-02 11:12:38Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Eugene Vasilchenko
27  *
28  * File Description:
29  *   Sample test application for cSRA data loader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbi_system.hpp>
37 #include <corelib/request_ctx.hpp>
38 #include <corelib/test_mt.hpp>
39 #include <util/random_gen.hpp>
40 #include <sra/data_loaders/csra/csraloader.hpp>
41 #include <sra/readers/ncbi_traces_path.hpp>
42 #include <sra/readers/sra/csraread.hpp>
43 
44 #include <objects/general/general__.hpp>
45 #include <objects/seq/seq__.hpp>
46 #include <objects/seqset/seqset__.hpp>
47 #include <objects/seqalign/seqalign__.hpp>
48 #include <objects/seqres/seqres__.hpp>
49 
50 #include <objmgr/scope.hpp>
51 
52 #include <common/test_assert.h>  /* This header must go last */
53 
54 USING_NCBI_SCOPE;
55 USING_SCOPE(objects);
56 
57 /////////////////////////////////////////////////////////////////////////////
58 //  CCSRATestApp::
59 
60 
61 class CCSRATestApp : public CThreadedApp
62 {
63 private:
64     virtual bool Thread_Run(int idx);
65     virtual bool TestApp_Init(void);
66     virtual bool TestApp_Exit(void);
67     virtual bool TestApp_Args(CArgDescriptions& args);
68 
69     CSeq_id_Handle GetHandle(const string& acc,
70                              Int8 spot_id,
71                              int read_id) const;
72     bool ExistsSpotId(CScope& scope,
73                       const string& acc,
74                       Int8 spot_id);
75     Int8 FindMaxSpotId(const string& acc);
76     void LoadRefSeqs();
77 
78     bool TestShortReads(int idx);
79     bool TestRefSeqs(int idx);
80 
81     bool m_Verbose;
82     bool m_PreLoad;
83     bool m_FullSeq;
84     bool m_ResetHistory;
85     bool m_TestRefSeqs;
86     int m_Seed;
87     int m_IterCount, m_IterSize;
88     int m_ErrorCount;
89     vector<string> m_Accession;
90     vector<Int8> m_MaxSpotId;
91     map<string, vector<CSeq_id_Handle> > m_RefIds;
92 
93     CRef<CObjectManager> m_OM;
94     CRef<CScope> m_SharedScope;
95 };
96 
97 
98 /////////////////////////////////////////////////////////////////////////////
99 //  Init test
TestApp_Args(CArgDescriptions & args)100 bool CCSRATestApp::TestApp_Args(CArgDescriptions& args)
101 {
102     // Specify USAGE context
103     args.SetUsageContext(GetArguments().GetProgramBasename(),
104                          "test_csra_loader_mt");
105 
106     args.AddDefaultKey("accs", "Accessions",
107                        "comma separated SRA accession list",
108                        CArgDescriptions::eString,
109                        "SRR000010");
110                      //"SRR000010,SRR389414,SRR494733,SRR505887,SRR035417");
111     args.AddOptionalKey("accs-file", "Accessions",
112                         "file with SRA accession list",
113                         CArgDescriptions::eInputFile);
114     args.AddDefaultKey("iter-count", "IterationCount",
115                        "Number of read iterations",
116                        CArgDescriptions::eInteger,
117                        "10");
118     args.AddDefaultKey("iter-size", "IterationSize",
119                        "Number of sequential sequences in one iteration",
120                        CArgDescriptions::eInteger,
121                        "10");
122     args.AddFlag("verbose", "Print info about progress");
123     args.AddFlag("preload", "Try to preload libraries in main thread");
124     args.AddFlag("full-seq", "Load full sequence");
125     args.AddFlag("shared-scope", "Use shared scope in all threads");
126     args.AddFlag("reset-history", "Reset scope's history after each iteration");
127     args.AddFlag("reference-sequences", "Test reference sequences retrieval");
128 
129     return true;
130 }
131 
132 
TestApp_Init(void)133 bool CCSRATestApp::TestApp_Init(void)
134 {
135     SetDiagPostLevel(eDiag_Info);
136     const CArgs& args = GetArgs();
137     m_Verbose = args["verbose"];
138     m_ErrorCount = 0;
139     m_Seed = args["seed"]? args["seed"].AsInteger(): int(time(0));
140     if ( m_Verbose ) {
141         LOG_POST(Info<<"Seed: "<<m_Seed);
142     }
143     NStr::Split(args["accs"].AsString(), ",", m_Accession);
144     if ( args["accs-file"] ) {
145         m_Accession.clear();
146         CNcbiIstream& in = args["accs-file"].AsInputFile();
147         string acc;
148         while ( in >> acc ) {
149             m_Accession.push_back(acc);
150         }
151     }
152     if ( m_Accession.empty() ) {
153         ERR_POST(Fatal<<"empty accession list");
154     }
155     m_IterCount = args["iter-count"].AsInteger();
156     m_IterSize = args["iter-size"].AsInteger();
157     m_MaxSpotId.assign(m_Accession.size(), 0);
158     m_OM = CObjectManager::GetInstance();
159     CCSRADataLoader::RegisterInObjectManager(*m_OM, CObjectManager::eDefault);
160     if ( args["shared-scope"] ) {
161         m_SharedScope = new CScope(*m_OM);
162         m_SharedScope->AddDefaults();
163     }
164     m_ResetHistory = args["reset-history"];
165     m_TestRefSeqs = args["reference-sequences"];
166     m_FullSeq = args["full-seq"];
167     if ( m_TestRefSeqs ) {
168         LoadRefSeqs();
169     }
170     if ( args["preload"] ) {
171         Thread_Run(-1);
172     }
173     return true;
174 }
175 
176 
TestApp_Exit(void)177 bool CCSRATestApp::TestApp_Exit(void)
178 {
179     if ( m_ErrorCount ) {
180         ERR_POST("Errors found: "<<m_ErrorCount);
181     }
182     else {
183         LOG_POST("Done.");
184     }
185     return !m_ErrorCount;
186 }
187 
188 /////////////////////////////////////////////////////////////////////////////
189 //  Run test
190 /////////////////////////////////////////////////////////////////////////////
191 
s_Check(const CBioseq & seq)192 void s_Check(const CBioseq& seq)
193 {
194     _ASSERT(!seq.GetId().empty());
195     const CSeq_inst& inst = seq.GetInst();
196     const string& seqdata = inst.GetSeq_data().GetIupacna().Get();
197     _ASSERT(seqdata.size() == inst.GetLength());
198     ITERATE ( string, i, seqdata ) {
199         _ASSERT(*i >= 'A' && *i <= 'Z');
200     }
201 }
202 
s_AsFASTA(const CBioseq & seq)203 string s_AsFASTA(const CBioseq& seq)
204 {
205     return seq.GetId().front()->AsFastaString()+" "+
206         seq.GetInst().GetSeq_data().GetIupacna().Get();
207 }
208 
GetHandle(const string & acc,Int8 spot_id,int read_id) const209 CSeq_id_Handle CCSRATestApp::GetHandle(const string& acc,
210                                        Int8 spot_id,
211                                        int read_id) const
212 {
213     CNcbiOstrstream str;
214     str << "gnl|SRA|" << acc << '.' << spot_id << '.' << read_id;
215     return CSeq_id_Handle::GetHandle(CNcbiOstrstreamToString(str));
216 }
217 
ExistsSpotId(CScope & scope,const string & acc,Int8 spot_id)218 bool CCSRATestApp::ExistsSpotId(CScope& scope,
219                                 const string& acc,
220                                 Int8 spot_id)
221 {
222     for ( int read_id = 1; read_id <= 4; ++read_id ) {
223         if ( !scope.GetIds(GetHandle(acc, spot_id, read_id)).empty() ) {
224             return true;
225         }
226     }
227     return false;
228 }
229 
FindMaxSpotId(const string & acc)230 Int8 CCSRATestApp::FindMaxSpotId(const string& acc)
231 {
232     CScope scope(*m_OM);
233     scope.AddDefaults();
234     Int8 a = 0;
235     Int8 b = 0xfffffff;
236     while ( ExistsSpotId(scope, acc, b) ) {
237         a = b;
238         b *= 2;
239     }
240     while ( b-a > 1 ) {
241         Int8 m = (a+b)/2;
242         if ( ExistsSpotId(scope, acc, m) ) {
243             a = m;
244         }
245         else {
246             b = m;
247         }
248     }
249     return a;
250 }
251 
LoadRefSeqs()252 void CCSRATestApp::LoadRefSeqs()
253 {
254     CVDBMgr mgr;
255     for ( auto& acc : m_Accession ) {
256         vector<CSeq_id_Handle> ids;
257         CCSraDb db(mgr, acc);
258         for ( CCSraRefSeqIterator it(db); it; ++it ) {
259             ids.push_back(CSeq_id_Handle::GetHandle("gnl|SRA|"+acc+"/"+it->m_Name));
260         }
261         if ( m_Verbose ) {
262             LOG_POST(Info<<": "<<acc<<" has "<<ids.size()<<" reference sequences");
263         }
264         m_RefIds[acc] = ids;;
265     }
266 }
267 
Thread_Run(int idx)268 bool CCSRATestApp::Thread_Run(int idx)
269 {
270     CDiagContext::GetRequestContext().SetClientIP("1.2.3."+to_string(idx));
271     CDiagContext::GetRequestContext().SetSessionID("session_"+to_string(idx));
272     CDiagContext::GetRequestContext().SetHitID("hit_"+to_string(idx));
273     if ( m_TestRefSeqs ) {
274         return TestRefSeqs(idx);
275     }
276     else {
277         return TestShortReads(idx);
278     }
279 }
280 
TestShortReads(int idx)281 bool CCSRATestApp::TestShortReads(int idx)
282 {
283     CRandom random(m_Seed+idx);
284     for ( int ti = 0; ti < m_IterCount; ++ti ) {
285         size_t index = random.GetRandIndexSize_t(m_Accession.size());
286         const string& acc = m_Accession[index];
287         if ( m_Verbose ) {
288             LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc);
289         }
290 
291         if ( !m_MaxSpotId[index] ) {
292             m_MaxSpotId[index] = FindMaxSpotId(acc);
293             if ( m_Verbose ) {
294                 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
295                          <<": max id = " << m_MaxSpotId[index]);
296             }
297             _ASSERT(m_MaxSpotId[index] > 0);
298         }
299         Int8 count = min(m_MaxSpotId[index], Int8(m_IterSize));
300         Int8 start_id = random.GetRandUint8(1, m_MaxSpotId[index]-count);
301         Int8 stop_id = start_id+count;
302         if ( m_Verbose ) {
303             LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
304                      <<": scan " << start_id<<" - "<<(stop_id-1));
305         }
306         CRef<CScope> scope_ref = m_SharedScope;
307         if ( !scope_ref ) {
308             scope_ref = new CScope(*m_OM);
309             scope_ref->AddDefaults();
310         }
311         CScope& scope = *scope_ref;
312         size_t seq_count = 0;
313         for ( Int8 spot_id = start_id; spot_id < stop_id; ++spot_id ) {
314             for ( int read_id = 1; read_id <= 4; ++read_id ) {
315                 CSeq_id_Handle id = GetHandle(acc, spot_id, read_id);
316                 _ASSERT(!scope.GetAccVer(id));
317                 _ASSERT(scope.GetGi(id) == ZERO_GI);
318                 if ( scope.GetIds(id).empty() ) {
319                     _ASSERT(scope.GetSequenceLength(id) == kInvalidSeqPos);
320                     _ASSERT(scope.GetSequenceType(id) == CSeq_inst::eMol_not_set);
321                     _ASSERT(!scope.GetBioseqHandle(id));
322                     _ASSERT(scope.GetTaxId(id) == INVALID_TAX_ID);
323                     continue;
324                 }
325                 ++seq_count;
326                 _ASSERT(scope.GetIds(id).front() == id);
327                 _ASSERT(scope.GetSequenceType(id) == CSeq_inst::eMol_na);
328                 TSeqPos len = scope.GetSequenceLength(id);
329                 TTaxId taxid = scope.GetTaxId(id);
330 
331                 if ( !m_FullSeq ) {
332                     if ( m_Verbose ) {
333                         LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
334                                  <<": "<<len);
335                     }
336                     continue;
337                 }
338                 CBioseq_Handle bh = scope.GetBioseqHandle(id);
339                 _ASSERT(bh);
340                 _ASSERT(bh.GetBioseqLength() == len);
341                 _ASSERT(scope.GetTaxId(id) == taxid);
342                 CConstRef<CBioseq> seq = bh.GetCompleteObject();
343                 s_Check(*seq);
344                 if ( true ) {
345                     if ( m_Verbose ) {
346                         LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
347                                  <<": "<<s_AsFASTA(*seq));
348                     }
349                 }
350                 else {
351                     if ( m_Verbose ) {
352                         LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
353                                  <<": "<<bh.GetSeqId());
354                     }
355                 }
356             }
357         }
358         _ASSERT(seq_count);
359         if ( m_ResetHistory ) {
360             scope.ResetHistory();
361         }
362     }
363     return true;
364 }
365 
366 
TestRefSeqs(int idx)367 bool CCSRATestApp::TestRefSeqs(int idx)
368 {
369     CRandom random(m_Seed+idx);
370     for ( int ti = 0; ti < m_IterCount; ++ti ) {
371         size_t index = random.GetRandIndexSize_t(m_Accession.size());
372         const string& acc = m_Accession[index];
373         vector<CSeq_id_Handle>& ids = m_RefIds[acc];
374         if ( ids.empty() ) {
375             continue;
376         }
377         size_t ref_index = random.GetRandIndexSize_t(ids.size());
378         const CSeq_id_Handle& id = ids[ref_index];
379         if ( m_Verbose ) {
380             LOG_POST(Info<<"T"<<idx<<"."<<ti<<": ref["<<index<<"]["<<ref_index<<"] "<<id);
381         }
382 
383         CRef<CScope> scope_ref = m_SharedScope;
384         if ( !scope_ref ) {
385             scope_ref = new CScope(*m_OM);
386             scope_ref->AddDefaults();
387         }
388         CScope& scope = *scope_ref;
389         {{
390             TSeqPos len = scope.GetSequenceLength(id);
391             CBioseq_Handle bh = scope.GetBioseqHandle(id);
392             _ASSERT(bh);
393             _ASSERT(bh.GetBioseqLength() == len);
394         }}
395         if ( m_ResetHistory ) {
396             scope.ResetHistory();
397         }
398     }
399     return true;
400 }
401 
402 
403 /////////////////////////////////////////////////////////////////////////////
404 //  Cleanup
405 
406 
407 /////////////////////////////////////////////////////////////////////////////
408 //  MAIN
409 
410 
main(int argc,const char * argv[])411 int main(int argc, const char* argv[])
412 {
413     // Execute main application function
414     return CCSRATestApp().AppMain(argc, argv);
415 }
416