1 /* $Id: wgs_test.cpp 613849 2020-08-13 14:22:10Z vasilche $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Authors: Eugene Vasilchenko
27 *
28 * File Description:
29 * Sample test application for WGS reader
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbi_system.hpp>
37 #include <sra/readers/sra/wgsread.hpp>
38 #include <sra/readers/sra/wgsresolver.hpp>
39 #include <sra/readers/sra/impl/wgsresolver_impl.hpp>
40 #include <sra/readers/ncbi_traces_path.hpp>
41
42 #include <objects/general/general__.hpp>
43 #include <objects/seq/seq__.hpp>
44 #include <objects/seqset/seqset__.hpp>
45 #include <objects/seqalign/seqalign__.hpp>
46 #include <objects/seqres/seqres__.hpp>
47 #include <objects/seqsplit/ID2S_Split_Info.hpp>
48
49 #include <objmgr/object_manager.hpp>
50 #include <objmgr/seq_vector.hpp>
51
52 #include <serial/serial.hpp>
53 #include <serial/objostrasnb.hpp>
54 #include <serial/objistrasnb.hpp>
55
56 #include <util/random_gen.hpp>
57
58 #include <common/test_assert.h> /* This header must go last */
59
60 USING_NCBI_SCOPE;
61 USING_SCOPE(objects);
62
63 /////////////////////////////////////////////////////////////////////////////
64 // CWGSTestApp::
65
66
67 class CWGSTestApp : public CNcbiApplication
68 {
69 private:
70 virtual void Init(void);
71 virtual int Run(void);
72 virtual void Exit(void);
73 };
74
75
76 /////////////////////////////////////////////////////////////////////////////
77 // Init test
78
Init(void)79 void CWGSTestApp::Init(void)
80 {
81 // Create command-line argument descriptions class
82 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
83
84 // Specify USAGE context
85 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
86 "wgs_test");
87
88 arg_desc->AddOptionalKey("vol_path", "VolPath",
89 "Search path for volumes",
90 CArgDescriptions::eString);
91 arg_desc->AddDefaultKey("file", "File",
92 "WGS file name, accession, or prefix",
93 CArgDescriptions::eString,
94 "AAAA");
95
96 arg_desc->AddOptionalKey("limit_count", "LimitCount",
97 "Number of entries to read (-1 : unlimited)",
98 CArgDescriptions::eInteger);
99 arg_desc->AddFlag("verbose", "Print info about found data");
100
101 arg_desc->AddFlag("withdrawn", "Include withdrawn sequences");
102 arg_desc->AddFlag("replaced", "Include replaced sequences");
103 arg_desc->AddFlag("suppressed", "Include suppressed sequences");
104 arg_desc->AddFlag("unverified", "Include unverified sequences");
105 arg_desc->AddFlag("master", "Include master descriptors if any");
106 arg_desc->AddFlag("print-master", "Print master entry");
107 arg_desc->AddFlag("master-no-filter", "Do not filter master descriptors");
108
109 //arg_desc->AddFlag("resolve-gi", "Resolve gi list");
110
111 arg_desc->AddFlag("gi-check", "Check GI index");
112 arg_desc->AddFlag("gi-range", "Print GI range if any");
113 arg_desc->AddDefaultKey("max-gap", "MaxGap",
114 "max gap in a single gi range",
115 CArgDescriptions::eInteger, "10000");
116
117 arg_desc->AddFlag("print_seq", "Print loaded Bioseq objects");
118 arg_desc->AddFlag("print_entry", "Print loaded Seq-entry objects");
119 arg_desc->AddFlag("print_split", "Print loaded Split info object");
120
121 arg_desc->AddFlag("keep_sequences", "Keep all sequences in memory");
122
123 arg_desc->AddOptionalKey("contig_row", "ContigRow",
124 "contig row to fetch",
125 CArgDescriptions::eInt8);
126 arg_desc->AddOptionalKey("contig_version", "ContigVersion",
127 "contig version to fetch",
128 CArgDescriptions::eInteger);
129 arg_desc->AddOptionalKey("scaffold_row", "ScaffoldRow",
130 "scaffold row to fetch",
131 CArgDescriptions::eInt8);
132 arg_desc->AddOptionalKey("protein_row", "ProteinRow",
133 "protein row to fetch",
134 CArgDescriptions::eInt8);
135
136 arg_desc->AddFlag("check_non_empty_lookup",
137 "Check that lookup produce non-empty result");
138 arg_desc->AddFlag("check_empty_lookup",
139 "Check that lookup produce empty result");
140 arg_desc->AddOptionalKey("gi", "GI",
141 "lookup by GI",
142 CArgDescriptions::eIntId);
143 arg_desc->AddOptionalKey("contig_name", "ContigName",
144 "lookup by contig name",
145 CArgDescriptions::eString);
146 arg_desc->AddOptionalKey("scaffold_name", "ScaffoldName",
147 "lookup by scaffold name",
148 CArgDescriptions::eString);
149 arg_desc->AddOptionalKey("protein_name", "ProteinName",
150 "lookup by protein name",
151 CArgDescriptions::eString);
152 arg_desc->AddOptionalKey("protein_acc", "ProteinAcc",
153 "lookup by protein accession",
154 CArgDescriptions::eString);
155 arg_desc->AddDefaultKey("seed", "RandomSeed",
156 "Seed for random number generator",
157 CArgDescriptions::eInteger, "1");
158 arg_desc->AddDefaultKey("seq_acc_count", "SequentialAccCount",
159 "size of spans of sequential accessions",
160 CArgDescriptions::eInteger, "10");
161
162 arg_desc->AddDefaultKey("o", "OutputFile",
163 "Output file of ASN.1",
164 CArgDescriptions::eOutputFile,
165 "-");
166
167 // Setup arg.descriptions for this application
168 SetupArgDescriptions(arg_desc.release());
169 }
170
171
sx_GetSeqData(const CBioseq & seq)172 string sx_GetSeqData(const CBioseq& seq)
173 {
174 CScope scope(*CObjectManager::GetInstance());
175 CBioseq_Handle bh = scope.AddBioseq(seq);
176 string ret;
177 bh.GetSeqVector().GetSeqData(0, kInvalidSeqPos, ret);
178 NON_CONST_ITERATE ( string, i, ret ) {
179 if ( *i == char(0xff) || *i == char(0) )
180 *i = char(0xf);
181 }
182 if ( 0 ) {
183 size_t w = 0;
184 ITERATE ( string, i, ret ) {
185 if ( w == 78 ) {
186 cout << '\n';
187 w = 0;
188 }
189 cout << "0123456789ABCDEF"[*i&0xff];
190 ++w;
191 }
192 cout << endl;
193 }
194 return ret;
195 }
196
197
198 /////////////////////////////////////////////////////////////////////////////
199 // Run test
200 /////////////////////////////////////////////////////////////////////////////
201
202 #if 0 // enable low-level SRA SDK test
203 # include <unistd.h>
204 # include <klib/rc.h>
205 # include <klib/writer.h>
206 # include <align/align-access.h>
207 # include <vdb/manager.h>
208 # include <vdb/database.h>
209 # include <vdb/table.h>
210 # include <vdb/cursor.h>
211 # include <vdb/vdb-priv.h>
212
213 // low level SRA SDK test
214 void CheckRc(rc_t rc, const char* code, const char* file, int line)
215 {
216 if ( rc ) {
217 char buffer1[4096];
218 size_t error_len;
219 RCExplain(rc, buffer1, sizeof(buffer1), &error_len);
220 char buffer2[8192];
221 size_t len = sprintf(buffer2, "%s:%d: %s failed: %#x: %s\n",
222 file, line, code, rc, buffer1);
223 write(2, buffer2, len);
224 exit(1);
225 }
226 }
227 #define CALL(call) CheckRc((call), #call, __FILE__, __LINE__)
228
229 struct SThreadInfo
230 {
231 pthread_t thread_id;
232 const VCursor* cursor;
233 vector<uint32_t> columns;
234 uint64_t row_start;
235 uint64_t row_end;
236 };
237
238 void* read_thread_func(void* arg)
239 {
240 SThreadInfo& info = *(SThreadInfo*)arg;
241 for ( int pass = 0; pass < 100; ++pass ) {
242 for ( uint64_t row = info.row_start; row < info.row_end; ++row ) {
243 for ( size_t i = 0; i < info.columns.size(); ++i ) {
244 const void* data;
245 uint32_t bit_offset, bit_length;
246 uint32_t elem_count;
247 CALL(VCursorCellDataDirect(info.cursor, row, info.columns[i],
248 &bit_length, &data, &bit_offset,
249 &elem_count));
250 }
251 }
252 }
253 return 0;
254 }
255
256 int LowLevelTest(void)
257 {
258 if ( 1 ) {
259 cout << "LowLevelTest for CSHW01000012.1 gap info..." << endl;
260 const VDBManager* mgr = 0;
261 CALL(VDBManagerMakeRead(&mgr, 0));
262
263 const VDatabase* db = 0;
264 CALL(VDBManagerOpenDBRead(mgr, &db, 0, "CSHW01"));
265
266 const VTable* table = 0;
267 CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
268
269 const VCursor* cursor = 0;
270 CALL(VTableCreateCursorRead(table, &cursor));
271 CALL(VCursorPermitPostOpenAdd(cursor));
272 CALL(VCursorOpen(cursor));
273
274 uint32_t col_GAP_PROPS;
275 CALL(VCursorAddColumn(cursor, &col_GAP_PROPS, "GAP_PROPS"));
276
277 const void* data;
278 uint32_t bit_offset, bit_length;
279 uint32_t elem_count;
280 CALL(VCursorCellDataDirect(cursor, 12, col_GAP_PROPS,
281 &bit_length, &data, &bit_offset,
282 &elem_count));
283 cout << "GAP_PROPS:";
284 for ( uint32_t i = 0; i < elem_count; ++i ) {
285 cout << ' ' << ((int16_t*)data)[i];
286 }
287 cout << endl;
288
289 CALL(VCursorRelease(cursor));
290 CALL(VTableRelease(table));
291 CALL(VDatabaseRelease(db));
292 CALL(VDBManagerRelease(mgr));
293 return 0;
294 }
295
296 if ( 1 ) {
297 cout << "LowLevelTest for AAAD01 opening..." << endl;
298 const VDBManager* mgr = 0;
299 CALL(VDBManagerMakeRead(&mgr, 0));
300
301 const VDatabase* db = 0;
302 CALL(VDBManagerOpenDBRead(mgr, &db, 0, "AAAD01"));
303
304 CALL(VDatabaseRelease(db));
305 CALL(VDBManagerRelease(mgr));
306 return 0;
307 }
308
309 if ( 0 ) {
310 cout << "LowLevelTest for multiple cursor opening..." << endl;
311 for ( int i = 0; i < 10; ++i ) {
312 cout << "Iteration " << i << endl;
313 const VDBManager* mgr = 0;
314 CALL(VDBManagerMakeRead(&mgr, 0));
315
316 const VDatabase* db = 0;
317 CALL(VDBManagerOpenDBRead(mgr, &db, 0, "GAMP01"));
318
319 const VTable* table = 0;
320 CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
321
322 const VCursor* cursor = 0;
323 CALL(VTableCreateCursorRead(table, &cursor));
324 CALL(VCursorPermitPostOpenAdd(cursor));
325 CALL(VCursorOpen(cursor));
326
327 uint32_t col_SEQ_ID, col_ACC_VERSION;
328 CALL(VCursorAddColumn(cursor, &col_SEQ_ID, "SEQ_ID"));
329 CALL(VCursorAddColumn(cursor, &col_ACC_VERSION, "ACC_VERSION"));
330
331 CALL(VCursorRelease(cursor));
332 CALL(VTableRelease(table));
333 CALL(VDatabaseRelease(db));
334 CALL(VDBManagerRelease(mgr));
335 }
336 }
337 if ( 0 ) {
338 cout << "LowLevelTest for MT cursor read..." << endl;
339 const VDBManager* mgr = 0;
340 CALL(VDBManagerMakeRead(&mgr, 0));
341
342 const VDatabase* db = 0;
343 //CALL(VDBManagerOpenDBRead(mgr, &db, 0, "GAMP01"));
344 CALL(VDBManagerOpenDBRead(mgr, &db, 0, "JRAS01"));
345 const size_t kNumCursors = 8;
346 const size_t kNumReads = 250/8;
347
348 const VTable* table = 0;
349 CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
350
351 SThreadInfo tinfo[kNumCursors];
352 for ( size_t i = 0; i < kNumCursors; ++i ) {
353 cout << "Create cursor " << i << endl;
354 CALL(VTableCreateCursorRead(table, &tinfo[i].cursor));
355 CALL(VCursorPermitPostOpenAdd(tinfo[i].cursor));
356 CALL(VCursorOpen(tinfo[i].cursor));
357
358 #define ADD_COLUMN(name) \
359 do { \
360 uint32_t column; \
361 CALL(VCursorAddColumn(tinfo[i].cursor, &column, name)); \
362 tinfo[i].columns.push_back(column); \
363 } while(0)
364
365 ADD_COLUMN("GI");
366 ADD_COLUMN("ACCESSION");
367 ADD_COLUMN("ACC_VERSION");
368 ADD_COLUMN("CONTIG_NAME");
369 ADD_COLUMN("NAME");
370 ADD_COLUMN("TITLE");
371 ADD_COLUMN("LABEL");
372 ADD_COLUMN("READ_START");
373 ADD_COLUMN("READ_LEN");
374 ADD_COLUMN("READ");
375 //ADD_COLUMN("SEQ_ID");
376 ADD_COLUMN("TAXID");
377 ADD_COLUMN("DESCR");
378 ADD_COLUMN("ANNOT");
379 ADD_COLUMN("GB_STATE");
380 ADD_COLUMN("GAP_START");
381 ADD_COLUMN("GAP_LEN");
382 ADD_COLUMN("GAP_PROPS");
383 ADD_COLUMN("GAP_LINKAGE");
384
385 #undef ADD_COLUMN
386 }
387 for ( size_t i = 0; i < kNumCursors; ++i ) {
388 cout << "Starting thread " << i << endl;
389 tinfo[i].row_start = 1+i*kNumReads;
390 tinfo[i].row_end = tinfo[i].row_start + kNumReads;
391 pthread_create(&tinfo[i].thread_id, 0, read_thread_func, &tinfo[i]);
392 }
393 for ( size_t i = 0; i < kNumCursors; ++i ) {
394 cout << "Waiting for thread " << i << endl;
395 void* ret = 0;
396 pthread_join(tinfo[i].thread_id, &ret);
397 }
398 for ( size_t i = 0; i < kNumCursors; ++i ) {
399 CALL(VCursorRelease(tinfo[i].cursor));
400 }
401 CALL(VTableRelease(table));
402 CALL(VDatabaseRelease(db));
403 CALL(VDBManagerRelease(mgr));
404 }
405 if ( 1 ) {
406 cout << "LowLevelTest sequence reading..." << endl;
407 for ( int i = 0; i < 1; ++i ) {
408 const VDBManager* mgr = 0;
409 CALL(VDBManagerMakeRead(&mgr, 0));
410
411 const VDatabase* db = 0;
412 CALL(VDBManagerOpenDBRead(mgr, &db, 0, "JELW01"));
413
414 const VTable* table = 0;
415 CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
416
417 const VCursor* cursor = 0;
418 CALL(VTableCreateCursorRead(table, &cursor));
419 CALL(VCursorPermitPostOpenAdd(cursor));
420 CALL(VCursorOpen(cursor));
421
422 char data0[10];
423 uint32_t total = 0;
424 const char* type = 0;
425 uint32_t bit_size = 0;
426 const size_t kBases = 1<<17;
427 char* buffer = new char[kBases];
428 uint32_t col;
429
430 uint64_t row0 = 1, row1 = 300;
431 CStopWatch sw(CStopWatch::eStart);
432 if ( 1 ) {
433 type = "packed 2na";
434 CALL(VCursorAddColumn(cursor, &col,
435 "(INSDC:2na:packed)READ"));
436 bit_size = 2;
437 }
438 if ( 0 ) {
439 type = "packed 4na";
440 CALL(VCursorAddColumn(cursor, &col,
441 "(INSDC:4na:packed)READ"));
442 bit_size = 4;
443 }
444 if ( 0 ) {
445 type = "byte 4na";
446 CALL(VCursorAddColumn(cursor, &col,
447 "(INSDC:4na:bin)READ"));
448 bit_size = 8;
449 }
450 const char* method = 0;
451 if ( 1 ) {
452 method = "in chunks";
453 for ( uint64_t row = row0; row <= row1; ++row ) {
454 uint32_t pos = 0, elem_read, elem_rem = 1;
455 for ( ; elem_rem; pos += elem_read ) {
456 CALL(VCursorReadBitsDirect(cursor, row, col,
457 bit_size, pos,
458 buffer, 0, kBases,
459 &elem_read, &elem_rem));
460 if ( row == 1 && pos == 0 ) memcpy(data0, buffer, 10);
461 }
462 total += pos;
463 }
464 }
465 else {
466 method = "in whole";
467 for ( uint64_t row = row0; row <= row1; ++row ) {
468 uint32_t bit_offset, bit_length, elem_count;
469 const void* data;
470 CALL(VCursorCellDataDirect(cursor, row, col,
471 &bit_length, &data, &bit_offset,
472 &elem_count));
473 assert(bit_length = bit_size);
474 if ( row == 1 ) memcpy(data0, data, 10);
475 total += elem_count;
476 }
477 }
478 double time = sw.Elapsed();
479 delete[] buffer;
480 cout << "read "<<type<<" "<<method
481 <<" time: "<<time<<" bases="<<total << endl;
482 cout << "data:" << hex;
483 for ( size_t i = 0; i < 10; ++i ) {
484 cout << ' ' << (data0[i]&0xff);
485 }
486 cout << dec << endl;
487
488 CALL(VCursorRelease(cursor));
489 CALL(VTableRelease(table));
490 CALL(VDatabaseRelease(db));
491 CALL(VDBManagerRelease(mgr));
492 }
493 }
494 cout << "LowLevelTest done" << endl;
495 return 0;
496 }
497 #endif
498
Run(void)499 int CWGSTestApp::Run(void)
500 {
501 #ifdef CALL
502 return LowLevelTest();
503 #endif
504
505 uint64_t error_count = 0;
506 const CArgs& args = GetArgs();
507
508 string path = args["file"].AsString();
509 bool verbose = args["verbose"];
510 bool print_seq = args["print_seq"];
511 bool print_entry = args["print_entry"];
512 bool print_split = args["print_split"];
513 size_t limit_count = 100;
514 if ( args["limit_count"] ) {
515 limit_count = size_t(args["limit_count"].AsInteger());
516 }
517
518 CNcbiOstream& out = args["o"].AsOutputFile();
519
520 CVDBMgr mgr;
521 CStopWatch sw;
522
523 sw.Restart();
524
525 if ( verbose ) {
526 try {
527 string acc = CWGSDb_Impl::NormalizePathOrAccession(path);
528 string resolved = mgr.FindAccPath(acc);
529 out << "Resolved "<<path<<" -> "<<acc<<" -> "<<resolved
530 << NcbiEndl;
531 }
532 catch ( CException& exc ) {
533 ERR_POST("FindAccPath failed: "<<exc);
534 }
535 }
536
537 bool is_component = false, is_scaffold = false, is_protein = false;
538 uint64_t row = 0;
539 int contig_version = -1;
540 if ( args["contig_version"] ) {
541 contig_version = args["contig_version"].AsInteger();
542 }
543 else if ( path.size() > 12 && path.find('/') == NPOS ) {
544 SIZE_TYPE dot_pos = path.rfind('.');
545 if ( dot_pos != NPOS &&
546 (contig_version = NStr::StringToNonNegativeInt(path.substr(dot_pos+1))) >= 0 ) {
547 path.resize(dot_pos);
548 }
549 }
550 if ( args["contig_row"] || args["scaffold_row"] || args["protein_row"] ) {
551 if ( args["contig_row"] ) {
552 row = args["contig_row"].AsInt8();
553 is_component = true;
554 }
555 else if ( args["scaffold_row"] ) {
556 row = args["scaffold_row"].AsInt8();
557 is_scaffold = true;
558 }
559 else if ( args["protein_row"] ) {
560 row = args["protein_row"].AsInt8();
561 is_protein = true;
562 }
563 }
564 else {
565 if ( !row && (row = CWGSDb::ParseContigRow(path)) ) {
566 is_component = true;
567 path = path.substr(0, 6);
568 }
569 if ( !row && (row = CWGSDb::ParseScaffoldRow(path)) ) {
570 is_scaffold = true;
571 path = path.substr(0, 6);
572 }
573 if ( !row && (row = CWGSDb::ParseProteinRow(path)) ) {
574 is_protein = true;
575 path = path.substr(0, 6);
576 }
577 }
578 if ( row && !args["limit_count"] ) {
579 limit_count = 1;
580 }
581
582 CWGSDb wgs_db(mgr, path);
583 if ( verbose ) {
584 out << "Opened WGS in "<<sw.Restart()
585 << NcbiEndl;
586 }
587 if ( wgs_db->GetProjectGBState() ) {
588 out << "WGS Project GB State: "<< wgs_db->GetProjectGBState() << NcbiEndl;
589 }
590 if ( wgs_db->IsReplaced() ) {
591 out << "WGS Project is replaced by "<< wgs_db->GetReplacedBy() << NcbiEndl;
592 }
593 if ( args["master"] ) {
594 CWGSDb::EDescrFilter filter = CWGSDb::eDescrDefaultFilter;
595 if ( args["master-no-filter"] ) {
596 filter = CWGSDb::eDescrNoFilter;
597 }
598 if ( !wgs_db.LoadMasterDescr(filter) ) {
599 ERR_POST("No master descriptors found");
600 }
601 }
602 if ( args["print-master"] ) {
603 if ( CRef<CSeq_entry> master = wgs_db->GetMasterSeq_entry() ) {
604 out << "Master " << MSerial_AsnText << *master;
605 }
606 else {
607 out << "No master entry" << NcbiEndl;
608 }
609 }
610
611 vector< CConstRef<CBioseq> > all_seqs;
612 bool keep_seqs = args["keep_sequences"];
613
614 CWGSSeqIterator::TIncludeFlags include_flags = CWGSSeqIterator::fIncludeLive;
615 if ( args["withdrawn"] ) {
616 include_flags |= CWGSSeqIterator::fIncludeWithdrawn;
617 }
618 if ( args["replaced"] ) {
619 include_flags |= CWGSSeqIterator::fIncludeReplaced;
620 }
621 if ( args["suppressed"] ) {
622 include_flags |= CWGSSeqIterator::fIncludeSuppressed;
623 }
624 if ( args["unverified"] ) {
625 include_flags |= CWGSSeqIterator::fIncludeUnverified;
626 }
627
628 if ( 1 ) {
629 CWGSSeqIterator it;
630 // try accession
631 if ( row ) {
632 // print only one accession
633 if ( is_component ) {
634 if ( !args["limit_count"] ) {
635 it = CWGSSeqIterator(wgs_db, row, include_flags);
636 }
637 else if ( row + limit_count < row ) {
638 it = CWGSSeqIterator(wgs_db, row, kMax_UI8, include_flags);
639 }
640 else {
641 it = CWGSSeqIterator(wgs_db, row, row+limit_count-1, include_flags);
642 }
643 if ( !it ) {
644 out << "No such row: "<<path
645 << NcbiEndl;
646 }
647 }
648 }
649 else {
650 // otherwise scan all sequences
651 it = CWGSSeqIterator(wgs_db, include_flags);
652 }
653 for ( size_t count = 0; it && count < limit_count; ++it, ++count ) {
654 if ( contig_version ) {
655 it.SelectAccVersion(contig_version);
656 }
657 out << it.GetAccession()<<'.'<<it.GetAccVersion();
658 if ( it.HasGi() ) {
659 out << " gi: "<<it.GetGi();
660 }
661 out << " len: "<<it.GetSeqLength();
662 if ( it.GetGBState() ) {
663 out << " gbstate: "<<it.GetGBState();
664 }
665 if ( it.HasSeqHash() ) {
666 out << " hash: 0x"<<hex<<it.GetSeqHash()<<dec;
667 }
668 if ( it.HasPublicComment() ) {
669 out << " comment: \""<<it.GetPublicComment()<<"\"";
670 }
671 out << '\n';
672 CRef<CBioseq> seq1 = it.GetBioseq();
673 CRef<CBioseq> seq2 = it.GetBioseq(it.fDefaultIds|it.fInst_ncbi4na);
674 if ( print_seq ) {
675 out << MSerial_AsnText << *seq1;
676 }
677 if ( print_entry ) {
678 out << MSerial_AsnText << *it.GetSeq_entry();
679 }
680 if ( print_split ) {
681 out << MSerial_AsnText << *it.GetSplitInfo();
682 }
683 string data1 = sx_GetSeqData(*seq1);
684 string data2 = sx_GetSeqData(*seq2);
685 if ( data1 != data2 ) {
686 size_t pos = 0;
687 while ( data1[pos] == data2[pos] ) {
688 ++pos;
689 }
690 ERR_POST(Fatal<<"Different Seq-data at " << pos << ": " <<
691 MSerial_AsnText << *seq1 << MSerial_AsnText << *seq2);
692 }
693 if ( keep_seqs ) {
694 all_seqs.push_back(seq1);
695 }
696 }
697 }
698
699 /*
700 if ( args["resolve-gi"] ) {
701 vector<string> gis_str;
702 NStr::Split(args["resolve-gi"].AsString(), ",", gis_str);
703 ITERATE ( vector<string>, it, gis_str ) {
704 TGi gi = NStr::StringToNumeric<TIntId>(*it);
705
706 }
707 }
708 */
709
710 if ( args["gi-check"] ) {
711 typedef map<TGi, uint64_t> TGiIdx;
712 TGiIdx nuc_idx;
713 for ( CWGSGiIterator it(wgs_db, CWGSGiIterator::eNuc); it; ++it ) {
714 nuc_idx[it.GetGi()] = it.GetRowId();
715 }
716 for ( CWGSSeqIterator it(wgs_db, include_flags); it; ++it ) {
717 if ( !it.HasGi() ) {
718 continue;
719 }
720 TGi gi = it.GetGi();
721 uint64_t row = it.GetCurrentRowId();
722 TGiIdx::iterator idx_it = nuc_idx.find(gi);
723 if ( idx_it == nuc_idx.end() ) {
724 ERR_POST("GI "<<gi<<" row="<<row<<" idx=none");
725 ++error_count;
726 }
727 else {
728 if ( idx_it->second != row ) {
729 ERR_POST("GI "<<gi<<" row="<<row<<" idx="<<idx_it->second);
730 ++error_count;
731 }
732 nuc_idx.erase(idx_it);
733 }
734 }
735 ITERATE ( TGiIdx, it, nuc_idx ) {
736 ERR_POST("GI "<<it->first<<" row=none"<<" idx="<<it->second);
737 ++error_count;
738 }
739 }
740 if ( args["gi-range"] ) {
741 pair<TGi, TGi> gi_range = wgs_db.GetNucGiRange();
742 if ( gi_range.second != ZERO_GI ) {
743 out << "Nucleotide GI range: "
744 << gi_range.first << " - " << gi_range.second
745 << NcbiEndl;
746 }
747 else {
748 out << "Nucleotide GI range is empty" << NcbiEndl;
749 }
750 gi_range = wgs_db.GetProtGiRange();
751 if ( gi_range.second != ZERO_GI ) {
752 out << "Protein GI range: "
753 << gi_range.first << " - " << gi_range.second
754 << NcbiEndl;
755 }
756 else {
757 out << "Protein GI range is empty" << NcbiEndl;
758 }
759 }
760 bool check_non_empty_lookup = args["check_non_empty_lookup"];
761 bool check_empty_lookup = args["check_empty_lookup"];
762 if ( args["gi"] ) {
763 TGi gi = GI_FROM(TIntId, args["gi"].AsIntId());
764 CRef<CWGSResolver> resolver = CWGSResolver_VDB::CreateResolver(mgr);
765 if ( resolver ) {
766 CWGSResolver::TWGSPrefixes prefixes = resolver->GetPrefixes(gi);
767 if ( prefixes.empty() ) {
768 out << "No WGS accessions with gi "<<gi<<NcbiEndl;
769 }
770 else {
771 ITERATE ( CWGSResolver::TWGSPrefixes, it, prefixes ) {
772 out << "GI "<<gi<<" is found in WGS " << *it << NcbiEndl;
773 }
774 }
775 }
776 CWGSGiIterator gi_it(wgs_db, gi);
777 if ( !gi_it ) {
778 out << "GI "<<gi<<" not found" << NcbiEndl;
779 if ( check_non_empty_lookup ) {
780 ++error_count;
781 }
782 }
783 else if ( gi_it.GetSeqType() == gi_it.eNuc ) {
784 out << "GI "<<gi<<" Nucleotide row: "<<gi_it.GetRowId()
785 << NcbiEndl;
786 if ( check_empty_lookup ) {
787 ++error_count;
788 }
789 CWGSSeqIterator it(wgs_db, gi_it.GetRowId(), include_flags);
790 if ( !it ) {
791 out << "No such row: "<< gi_it.GetRowId() << NcbiEndl;
792 ++error_count;
793 }
794 else {
795 out << "GI "<<gi<<" len: "<<it.GetSeqLength() << NcbiEndl;
796 if ( print_seq ) {
797 out << MSerial_AsnText << *it.GetBioseq();
798 }
799 if ( print_entry ) {
800 out << MSerial_AsnText << *it.GetSeq_entry();
801 }
802 }
803 }
804 else {
805 out << "GI "<<gi<<" Protein row: "<<gi_it.GetRowId() << NcbiEndl;
806 if ( check_empty_lookup ) {
807 ++error_count;
808 }
809 CWGSProteinIterator it(wgs_db, gi_it.GetRowId());
810 if ( !it ) {
811 out << "No such row: "<< gi_it.GetRowId() << NcbiEndl;
812 ++error_count;
813 }
814 else {
815 out << "GI "<<gi<<" len: "<<it.GetSeqLength();
816 if ( it.HasPublicComment() ) {
817 out << " comment: \""<<it.GetPublicComment()<<"\"";
818 }
819 out << NcbiEndl;
820
821 if ( print_seq ) {
822 out << MSerial_AsnText << *it.GetBioseq();
823 }
824 if ( print_entry ) {
825 out << MSerial_AsnText << *it.GetSeq_entry();
826 }
827 }
828 }
829 }
830 if ( args["contig_name"] ) {
831 string name = args["contig_name"].AsString();
832 uint64_t row_id = wgs_db.GetContigNameRowId(name);
833 out << "Contig name "<<name<<" is in CONTIG table row " << row_id
834 << NcbiEndl;
835 if ( !row_id ) {
836 if ( check_non_empty_lookup ) {
837 ++error_count;
838 }
839 }
840 else {
841 if ( check_empty_lookup ) {
842 ++error_count;
843 }
844 CWGSSeqIterator it(wgs_db, row_id, include_flags);
845 if ( !it ) {
846 out << "CONTIG: No such row: "<< row_id << NcbiEndl;
847 ++error_count;
848 }
849 else {
850 out << "CONTIG["<<row_id<<"] len: "<<it.GetSeqLength()
851 << " name: " << it.GetContigName()
852 << NcbiEndl;
853 if ( !NStr::EqualNocase(it.GetContigName(), name) ) {
854 out << "Name is different!" << NcbiEndl;
855 ++error_count;
856 }
857 if ( print_seq ) {
858 out << MSerial_AsnText << *it.GetBioseq();
859 }
860 if ( print_entry ) {
861 out << MSerial_AsnText << *it.GetSeq_entry();
862 }
863 }
864 }
865 }
866 if ( args["scaffold_name"] ) {
867 string name = args["scaffold_name"].AsString();
868 uint64_t row_id = wgs_db.GetScaffoldNameRowId(name);
869 out << "Scaffold name "<<name<<" is in SCAFFOLD table row " << row_id
870 << NcbiEndl;
871 if ( !row_id ) {
872 if ( check_non_empty_lookup ) {
873 ++error_count;
874 }
875 }
876 else {
877 if ( check_empty_lookup ) {
878 ++error_count;
879 }
880 CWGSScaffoldIterator it(wgs_db, row_id);
881 if ( !it ) {
882 out << "SCAFFOLD: No such row: "<< row_id << NcbiEndl;
883 ++error_count;
884 }
885 else {
886 out << "SCAFFOLD["<<row_id<<"] len: "<<it.GetSeqLength()
887 << " name: " << it.GetScaffoldName()
888 << NcbiEndl;
889 if ( !NStr::EqualNocase(it.GetScaffoldName(), name) ) {
890 out << "Name is different!" << NcbiEndl;
891 ++error_count;
892 }
893 if ( print_seq ) {
894 out << MSerial_AsnText << *it.GetBioseq();
895 }
896 if ( print_entry ) {
897 out << MSerial_AsnText << *it.GetSeq_entry();
898 }
899 if ( keep_seqs ) {
900 all_seqs.push_back(it.GetBioseq());
901 }
902 }
903 }
904 }
905 if ( args["protein_name"] ) {
906 string name = args["protein_name"].AsString();
907 uint64_t row_id = wgs_db.GetProteinNameRowId(name);
908 out << "Protein name "<<name<<" is in PROTEIN table row " << row_id
909 << NcbiEndl;
910 if ( !row_id ) {
911 if ( check_non_empty_lookup ) {
912 ++error_count;
913 }
914 }
915 else {
916 if ( check_empty_lookup ) {
917 ++error_count;
918 }
919 CWGSProteinIterator it(wgs_db, row_id);
920 if ( !it ) {
921 out << "PROTEIN: No such row: "<< row_id << NcbiEndl;
922 ++error_count;
923 }
924 else {
925 out << "PROTEIN["<<row_id<<"] len: "<<it.GetSeqLength()
926 << " name: " << it.GetProteinName()
927 << NcbiEndl;
928 if ( !NStr::EqualNocase(it.GetProteinName(), name) ) {
929 out << "Name is different!" << NcbiEndl;
930 ++error_count;
931 }
932 if ( print_seq ) {
933 out << MSerial_AsnText << *it.GetBioseq();
934 }
935 if ( print_entry ) {
936 out << MSerial_AsnText << *it.GetSeq_entry();
937 }
938 }
939 }
940 }
941 if ( args["protein_acc"] ) {
942 string param = args["protein_acc"].AsString();
943 unsigned random_count = 0;
944 try {
945 random_count = NStr::StringToNumeric<unsigned>(param);
946 }
947 catch ( CStringException& /*ignored*/ ) {
948 }
949 vector<string> accs;
950 if ( random_count ) {
951 unsigned seq_count = args["seq_acc_count"].AsInteger();
952 CRandom r(args["seed"].AsInteger());
953 for ( unsigned i = 0; i < random_count; ++i ) {
954 string prefix;
955 for ( int j = 0; j < 3; ++j ) {
956 prefix += char(r.GetRand('A', 'Z'));
957 }
958 unsigned start = r.GetRand(0, 99999);
959 unsigned count = min(min(100000-start, random_count-i), r.GetRand(1, seq_count));
960 for ( unsigned j = 0; j < count; ++j ) {
961 string s = NStr::IntToString(start+j);
962 s = string(5-s.size(), '0') + s;
963 accs.push_back(prefix+s);
964 }
965 i += count-1;
966 }
967 }
968 else {
969 NStr::Split(param, ",", accs);
970 }
971 unsigned found_count = 0;
972 CStopWatch sw(CStopWatch::eStart);
973 CRef<CWGSResolver> resolver = CWGSResolver_VDB::CreateResolver(mgr);
974 if ( resolver ) {
975 for ( auto& acc_ver : accs ) {
976 string acc, ver;
977 NStr::SplitInTwo(acc_ver, ".", acc, ver);
978 CWGSResolver::TWGSPrefixes prefixes = resolver->GetPrefixes(acc);
979 if ( random_count ) {
980 found_count += (prefixes.size()!=0);
981 }
982 else if ( prefixes.empty() ) {
983 out << "No WGS accessions with protein acc "<<acc<<NcbiEndl;
984 }
985 else {
986 ITERATE ( CWGSResolver::TWGSPrefixes, it, prefixes ) {
987 out << "Protein acc "<<acc<<" is found in WGS " << *it << NcbiEndl;
988 }
989 }
990 }
991 }
992 out << "Resolved "<<accs.size()<<" WGS accessions in " << sw.Restart() << "s" << endl;
993 if ( random_count ) {
994 out << "Found valid WGS accessions: "<<found_count<<endl;
995 }
996 if ( !random_count ) {
997 for ( auto& acc_ver : accs ) {
998 string acc, ver;
999 NStr::SplitInTwo(acc_ver, ".", acc, ver);
1000 int version = ver.empty()? -1: NStr::StringToNumeric<int>(ver);
1001 uint64_t row_id = wgs_db.GetProtAccRowId(acc, version);
1002 out << "Protein acc "<<acc_ver<<" is in PROTEIN table row " << row_id
1003 << NcbiEndl;
1004 if ( !row_id ) {
1005 if ( check_non_empty_lookup ) {
1006 ++error_count;
1007 }
1008 }
1009 else {
1010 if ( check_empty_lookup ) {
1011 ++error_count;
1012 }
1013 CWGSProteinIterator it(wgs_db, row_id);
1014 if ( !it ) {
1015 out << "PROTEIN: No such row: "<< row_id << NcbiEndl;
1016 ++error_count;
1017 }
1018 else {
1019 out << "PROTEIN["<<row_id<<"] len: "<<it.GetSeqLength()
1020 << " acc: " << it.GetAccession()<<"."<<it.GetAccVersion()
1021 << NcbiEndl;
1022 if ( !NStr::EqualNocase(it.GetAccession(), acc) ) {
1023 out << "Accession is different!" << NcbiEndl;
1024 ++error_count;
1025 }
1026 if ( version != -1 && it.GetAccVersion() != version ) {
1027 out << "Version is different!" << NcbiEndl;
1028 ++error_count;
1029 }
1030 if ( print_seq ) {
1031 out << MSerial_AsnText << *it.GetBioseq();
1032 }
1033 if ( print_entry ) {
1034 out << MSerial_AsnText << *it.GetSeq_entry();
1035 }
1036 }
1037 }
1038 }
1039 out << "Found "<<accs.size()<<" WGS accessions in " << sw.Restart() << "s" << endl;
1040 }
1041 }
1042
1043 if ( 1 ) {
1044 CWGSScaffoldIterator it;
1045 if ( row ) {
1046 if ( is_scaffold ) {
1047 it = CWGSScaffoldIterator(wgs_db, row);
1048 if ( !it ) {
1049 out << "No such scaffold row: "<<path
1050 << NcbiEndl;
1051 }
1052 }
1053 }
1054 else {
1055 it = CWGSScaffoldIterator(wgs_db);
1056 }
1057 for ( size_t count = 0; it && count < limit_count; ++it, ++count ) {
1058 out << it.GetScaffoldName() << '\n';
1059 CRef<CBioseq> seq = it.GetBioseq();
1060 if ( print_seq ) {
1061 out << MSerial_AsnText << *seq;
1062 }
1063 if ( print_entry ) {
1064 out << MSerial_AsnText << *it.GetSeq_entry();
1065 }
1066 }
1067 }
1068
1069 if ( 1 ) {
1070 CWGSProteinIterator it;
1071 if ( row ) {
1072 if ( is_protein ) {
1073 it = CWGSProteinIterator(wgs_db, row);
1074 if ( !it ) {
1075 out << "No such protein row: "<<path
1076 << NcbiEndl;
1077 }
1078 }
1079 }
1080 else {
1081 it = CWGSProteinIterator(wgs_db);
1082 }
1083 for ( size_t count = 0; it && count < limit_count; ++it, ++count ) {
1084 out << it.GetProteinName() << '\n';
1085 CRef<CBioseq> seq = it.GetBioseq();
1086 if ( print_seq ) {
1087 out << MSerial_AsnText << *seq;
1088 }
1089 if ( print_entry ) {
1090 out << MSerial_AsnText << *it.GetSeq_entry();
1091 }
1092 }
1093 }
1094
1095 if ( error_count ) {
1096 out << "Failure. Error count: "<<error_count<< NcbiEndl;
1097 return 1;
1098 }
1099 else {
1100 out << "Success." << NcbiEndl;
1101 return 0;
1102 }
1103 }
1104
1105
1106 /////////////////////////////////////////////////////////////////////////////
1107 // Cleanup
1108
1109
Exit(void)1110 void CWGSTestApp::Exit(void)
1111 {
1112 SetDiagStream(0);
1113 }
1114
1115
1116 /////////////////////////////////////////////////////////////////////////////
1117 // MAIN
1118
1119
main(int argc,const char * argv[])1120 int main(int argc, const char* argv[])
1121 {
1122 // Execute main application function
1123 return CWGSTestApp().AppMain(argc, argv);
1124 }
1125