1 /* $Id: test_csra_loader_mt.cpp 632472 2021-06-02 11:12:38Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Eugene Vasilchenko
27 *
28 * File Description:
29 * Sample test application for cSRA data loader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbi_system.hpp>
37 #include <corelib/request_ctx.hpp>
38 #include <corelib/test_mt.hpp>
39 #include <util/random_gen.hpp>
40 #include <sra/data_loaders/csra/csraloader.hpp>
41 #include <sra/readers/ncbi_traces_path.hpp>
42 #include <sra/readers/sra/csraread.hpp>
43
44 #include <objects/general/general__.hpp>
45 #include <objects/seq/seq__.hpp>
46 #include <objects/seqset/seqset__.hpp>
47 #include <objects/seqalign/seqalign__.hpp>
48 #include <objects/seqres/seqres__.hpp>
49
50 #include <objmgr/scope.hpp>
51
52 #include <common/test_assert.h> /* This header must go last */
53
54 USING_NCBI_SCOPE;
55 USING_SCOPE(objects);
56
57 /////////////////////////////////////////////////////////////////////////////
58 // CCSRATestApp::
59
60
61 class CCSRATestApp : public CThreadedApp
62 {
63 private:
64 virtual bool Thread_Run(int idx);
65 virtual bool TestApp_Init(void);
66 virtual bool TestApp_Exit(void);
67 virtual bool TestApp_Args(CArgDescriptions& args);
68
69 CSeq_id_Handle GetHandle(const string& acc,
70 Int8 spot_id,
71 int read_id) const;
72 bool ExistsSpotId(CScope& scope,
73 const string& acc,
74 Int8 spot_id);
75 Int8 FindMaxSpotId(const string& acc);
76 void LoadRefSeqs();
77
78 bool TestShortReads(int idx);
79 bool TestRefSeqs(int idx);
80
81 bool m_Verbose;
82 bool m_PreLoad;
83 bool m_FullSeq;
84 bool m_ResetHistory;
85 bool m_TestRefSeqs;
86 int m_Seed;
87 int m_IterCount, m_IterSize;
88 int m_ErrorCount;
89 vector<string> m_Accession;
90 vector<Int8> m_MaxSpotId;
91 map<string, vector<CSeq_id_Handle> > m_RefIds;
92
93 CRef<CObjectManager> m_OM;
94 CRef<CScope> m_SharedScope;
95 };
96
97
98 /////////////////////////////////////////////////////////////////////////////
99 // Init test
TestApp_Args(CArgDescriptions & args)100 bool CCSRATestApp::TestApp_Args(CArgDescriptions& args)
101 {
102 // Specify USAGE context
103 args.SetUsageContext(GetArguments().GetProgramBasename(),
104 "test_csra_loader_mt");
105
106 args.AddDefaultKey("accs", "Accessions",
107 "comma separated SRA accession list",
108 CArgDescriptions::eString,
109 "SRR000010");
110 //"SRR000010,SRR389414,SRR494733,SRR505887,SRR035417");
111 args.AddOptionalKey("accs-file", "Accessions",
112 "file with SRA accession list",
113 CArgDescriptions::eInputFile);
114 args.AddDefaultKey("iter-count", "IterationCount",
115 "Number of read iterations",
116 CArgDescriptions::eInteger,
117 "10");
118 args.AddDefaultKey("iter-size", "IterationSize",
119 "Number of sequential sequences in one iteration",
120 CArgDescriptions::eInteger,
121 "10");
122 args.AddFlag("verbose", "Print info about progress");
123 args.AddFlag("preload", "Try to preload libraries in main thread");
124 args.AddFlag("full-seq", "Load full sequence");
125 args.AddFlag("shared-scope", "Use shared scope in all threads");
126 args.AddFlag("reset-history", "Reset scope's history after each iteration");
127 args.AddFlag("reference-sequences", "Test reference sequences retrieval");
128
129 return true;
130 }
131
132
TestApp_Init(void)133 bool CCSRATestApp::TestApp_Init(void)
134 {
135 SetDiagPostLevel(eDiag_Info);
136 const CArgs& args = GetArgs();
137 m_Verbose = args["verbose"];
138 m_ErrorCount = 0;
139 m_Seed = args["seed"]? args["seed"].AsInteger(): int(time(0));
140 if ( m_Verbose ) {
141 LOG_POST(Info<<"Seed: "<<m_Seed);
142 }
143 NStr::Split(args["accs"].AsString(), ",", m_Accession);
144 if ( args["accs-file"] ) {
145 m_Accession.clear();
146 CNcbiIstream& in = args["accs-file"].AsInputFile();
147 string acc;
148 while ( in >> acc ) {
149 m_Accession.push_back(acc);
150 }
151 }
152 if ( m_Accession.empty() ) {
153 ERR_POST(Fatal<<"empty accession list");
154 }
155 m_IterCount = args["iter-count"].AsInteger();
156 m_IterSize = args["iter-size"].AsInteger();
157 m_MaxSpotId.assign(m_Accession.size(), 0);
158 m_OM = CObjectManager::GetInstance();
159 CCSRADataLoader::RegisterInObjectManager(*m_OM, CObjectManager::eDefault);
160 if ( args["shared-scope"] ) {
161 m_SharedScope = new CScope(*m_OM);
162 m_SharedScope->AddDefaults();
163 }
164 m_ResetHistory = args["reset-history"];
165 m_TestRefSeqs = args["reference-sequences"];
166 m_FullSeq = args["full-seq"];
167 if ( m_TestRefSeqs ) {
168 LoadRefSeqs();
169 }
170 if ( args["preload"] ) {
171 Thread_Run(-1);
172 }
173 return true;
174 }
175
176
TestApp_Exit(void)177 bool CCSRATestApp::TestApp_Exit(void)
178 {
179 if ( m_ErrorCount ) {
180 ERR_POST("Errors found: "<<m_ErrorCount);
181 }
182 else {
183 LOG_POST("Done.");
184 }
185 return !m_ErrorCount;
186 }
187
188 /////////////////////////////////////////////////////////////////////////////
189 // Run test
190 /////////////////////////////////////////////////////////////////////////////
191
s_Check(const CBioseq & seq)192 void s_Check(const CBioseq& seq)
193 {
194 _ASSERT(!seq.GetId().empty());
195 const CSeq_inst& inst = seq.GetInst();
196 const string& seqdata = inst.GetSeq_data().GetIupacna().Get();
197 _ASSERT(seqdata.size() == inst.GetLength());
198 ITERATE ( string, i, seqdata ) {
199 _ASSERT(*i >= 'A' && *i <= 'Z');
200 }
201 }
202
s_AsFASTA(const CBioseq & seq)203 string s_AsFASTA(const CBioseq& seq)
204 {
205 return seq.GetId().front()->AsFastaString()+" "+
206 seq.GetInst().GetSeq_data().GetIupacna().Get();
207 }
208
GetHandle(const string & acc,Int8 spot_id,int read_id) const209 CSeq_id_Handle CCSRATestApp::GetHandle(const string& acc,
210 Int8 spot_id,
211 int read_id) const
212 {
213 CNcbiOstrstream str;
214 str << "gnl|SRA|" << acc << '.' << spot_id << '.' << read_id;
215 return CSeq_id_Handle::GetHandle(CNcbiOstrstreamToString(str));
216 }
217
ExistsSpotId(CScope & scope,const string & acc,Int8 spot_id)218 bool CCSRATestApp::ExistsSpotId(CScope& scope,
219 const string& acc,
220 Int8 spot_id)
221 {
222 for ( int read_id = 1; read_id <= 4; ++read_id ) {
223 if ( !scope.GetIds(GetHandle(acc, spot_id, read_id)).empty() ) {
224 return true;
225 }
226 }
227 return false;
228 }
229
FindMaxSpotId(const string & acc)230 Int8 CCSRATestApp::FindMaxSpotId(const string& acc)
231 {
232 CScope scope(*m_OM);
233 scope.AddDefaults();
234 Int8 a = 0;
235 Int8 b = 0xfffffff;
236 while ( ExistsSpotId(scope, acc, b) ) {
237 a = b;
238 b *= 2;
239 }
240 while ( b-a > 1 ) {
241 Int8 m = (a+b)/2;
242 if ( ExistsSpotId(scope, acc, m) ) {
243 a = m;
244 }
245 else {
246 b = m;
247 }
248 }
249 return a;
250 }
251
LoadRefSeqs()252 void CCSRATestApp::LoadRefSeqs()
253 {
254 CVDBMgr mgr;
255 for ( auto& acc : m_Accession ) {
256 vector<CSeq_id_Handle> ids;
257 CCSraDb db(mgr, acc);
258 for ( CCSraRefSeqIterator it(db); it; ++it ) {
259 ids.push_back(CSeq_id_Handle::GetHandle("gnl|SRA|"+acc+"/"+it->m_Name));
260 }
261 if ( m_Verbose ) {
262 LOG_POST(Info<<": "<<acc<<" has "<<ids.size()<<" reference sequences");
263 }
264 m_RefIds[acc] = ids;;
265 }
266 }
267
Thread_Run(int idx)268 bool CCSRATestApp::Thread_Run(int idx)
269 {
270 CDiagContext::GetRequestContext().SetClientIP("1.2.3."+to_string(idx));
271 CDiagContext::GetRequestContext().SetSessionID("session_"+to_string(idx));
272 CDiagContext::GetRequestContext().SetHitID("hit_"+to_string(idx));
273 if ( m_TestRefSeqs ) {
274 return TestRefSeqs(idx);
275 }
276 else {
277 return TestShortReads(idx);
278 }
279 }
280
TestShortReads(int idx)281 bool CCSRATestApp::TestShortReads(int idx)
282 {
283 CRandom random(m_Seed+idx);
284 for ( int ti = 0; ti < m_IterCount; ++ti ) {
285 size_t index = random.GetRandIndexSize_t(m_Accession.size());
286 const string& acc = m_Accession[index];
287 if ( m_Verbose ) {
288 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc);
289 }
290
291 if ( !m_MaxSpotId[index] ) {
292 m_MaxSpotId[index] = FindMaxSpotId(acc);
293 if ( m_Verbose ) {
294 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
295 <<": max id = " << m_MaxSpotId[index]);
296 }
297 _ASSERT(m_MaxSpotId[index] > 0);
298 }
299 Int8 count = min(m_MaxSpotId[index], Int8(m_IterSize));
300 Int8 start_id = random.GetRandUint8(1, m_MaxSpotId[index]-count);
301 Int8 stop_id = start_id+count;
302 if ( m_Verbose ) {
303 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
304 <<": scan " << start_id<<" - "<<(stop_id-1));
305 }
306 CRef<CScope> scope_ref = m_SharedScope;
307 if ( !scope_ref ) {
308 scope_ref = new CScope(*m_OM);
309 scope_ref->AddDefaults();
310 }
311 CScope& scope = *scope_ref;
312 size_t seq_count = 0;
313 for ( Int8 spot_id = start_id; spot_id < stop_id; ++spot_id ) {
314 for ( int read_id = 1; read_id <= 4; ++read_id ) {
315 CSeq_id_Handle id = GetHandle(acc, spot_id, read_id);
316 _ASSERT(!scope.GetAccVer(id));
317 _ASSERT(scope.GetGi(id) == ZERO_GI);
318 if ( scope.GetIds(id).empty() ) {
319 _ASSERT(scope.GetSequenceLength(id) == kInvalidSeqPos);
320 _ASSERT(scope.GetSequenceType(id) == CSeq_inst::eMol_not_set);
321 _ASSERT(!scope.GetBioseqHandle(id));
322 _ASSERT(scope.GetTaxId(id) == INVALID_TAX_ID);
323 continue;
324 }
325 ++seq_count;
326 _ASSERT(scope.GetIds(id).front() == id);
327 _ASSERT(scope.GetSequenceType(id) == CSeq_inst::eMol_na);
328 TSeqPos len = scope.GetSequenceLength(id);
329 TTaxId taxid = scope.GetTaxId(id);
330
331 if ( !m_FullSeq ) {
332 if ( m_Verbose ) {
333 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
334 <<": "<<len);
335 }
336 continue;
337 }
338 CBioseq_Handle bh = scope.GetBioseqHandle(id);
339 _ASSERT(bh);
340 _ASSERT(bh.GetBioseqLength() == len);
341 _ASSERT(scope.GetTaxId(id) == taxid);
342 CConstRef<CBioseq> seq = bh.GetCompleteObject();
343 s_Check(*seq);
344 if ( true ) {
345 if ( m_Verbose ) {
346 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
347 <<": "<<s_AsFASTA(*seq));
348 }
349 }
350 else {
351 if ( m_Verbose ) {
352 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": acc["<<index<<"] "<<acc
353 <<": "<<bh.GetSeqId());
354 }
355 }
356 }
357 }
358 _ASSERT(seq_count);
359 if ( m_ResetHistory ) {
360 scope.ResetHistory();
361 }
362 }
363 return true;
364 }
365
366
TestRefSeqs(int idx)367 bool CCSRATestApp::TestRefSeqs(int idx)
368 {
369 CRandom random(m_Seed+idx);
370 for ( int ti = 0; ti < m_IterCount; ++ti ) {
371 size_t index = random.GetRandIndexSize_t(m_Accession.size());
372 const string& acc = m_Accession[index];
373 vector<CSeq_id_Handle>& ids = m_RefIds[acc];
374 if ( ids.empty() ) {
375 continue;
376 }
377 size_t ref_index = random.GetRandIndexSize_t(ids.size());
378 const CSeq_id_Handle& id = ids[ref_index];
379 if ( m_Verbose ) {
380 LOG_POST(Info<<"T"<<idx<<"."<<ti<<": ref["<<index<<"]["<<ref_index<<"] "<<id);
381 }
382
383 CRef<CScope> scope_ref = m_SharedScope;
384 if ( !scope_ref ) {
385 scope_ref = new CScope(*m_OM);
386 scope_ref->AddDefaults();
387 }
388 CScope& scope = *scope_ref;
389 {{
390 TSeqPos len = scope.GetSequenceLength(id);
391 CBioseq_Handle bh = scope.GetBioseqHandle(id);
392 _ASSERT(bh);
393 _ASSERT(bh.GetBioseqLength() == len);
394 }}
395 if ( m_ResetHistory ) {
396 scope.ResetHistory();
397 }
398 }
399 return true;
400 }
401
402
403 /////////////////////////////////////////////////////////////////////////////
404 // Cleanup
405
406
407 /////////////////////////////////////////////////////////////////////////////
408 // MAIN
409
410
main(int argc,const char * argv[])411 int main(int argc, const char* argv[])
412 {
413 // Execute main application function
414 return CCSRATestApp().AppMain(argc, argv);
415 }
416