1 /*  $Id: wgs_test.cpp 613849 2020-08-13 14:22:10Z vasilche $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Authors:  Eugene Vasilchenko
27  *
28  * File Description:
29  *   Sample test application for WGS reader
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbiapp.hpp>
35 #include <corelib/ncbifile.hpp>
36 #include <corelib/ncbi_system.hpp>
37 #include <sra/readers/sra/wgsread.hpp>
38 #include <sra/readers/sra/wgsresolver.hpp>
39 #include <sra/readers/sra/impl/wgsresolver_impl.hpp>
40 #include <sra/readers/ncbi_traces_path.hpp>
41 
42 #include <objects/general/general__.hpp>
43 #include <objects/seq/seq__.hpp>
44 #include <objects/seqset/seqset__.hpp>
45 #include <objects/seqalign/seqalign__.hpp>
46 #include <objects/seqres/seqres__.hpp>
47 #include <objects/seqsplit/ID2S_Split_Info.hpp>
48 
49 #include <objmgr/object_manager.hpp>
50 #include <objmgr/seq_vector.hpp>
51 
52 #include <serial/serial.hpp>
53 #include <serial/objostrasnb.hpp>
54 #include <serial/objistrasnb.hpp>
55 
56 #include <util/random_gen.hpp>
57 
58 #include <common/test_assert.h>  /* This header must go last */
59 
60 USING_NCBI_SCOPE;
61 USING_SCOPE(objects);
62 
63 /////////////////////////////////////////////////////////////////////////////
64 //  CWGSTestApp::
65 
66 
67 class CWGSTestApp : public CNcbiApplication
68 {
69 private:
70     virtual void Init(void);
71     virtual int  Run(void);
72     virtual void Exit(void);
73 };
74 
75 
76 /////////////////////////////////////////////////////////////////////////////
77 //  Init test
78 
Init(void)79 void CWGSTestApp::Init(void)
80 {
81     // Create command-line argument descriptions class
82     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
83 
84     // Specify USAGE context
85     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
86                               "wgs_test");
87 
88     arg_desc->AddOptionalKey("vol_path", "VolPath",
89                              "Search path for volumes",
90                              CArgDescriptions::eString);
91     arg_desc->AddDefaultKey("file", "File",
92                             "WGS file name, accession, or prefix",
93                             CArgDescriptions::eString,
94                             "AAAA");
95 
96     arg_desc->AddOptionalKey("limit_count", "LimitCount",
97                              "Number of entries to read (-1 : unlimited)",
98                              CArgDescriptions::eInteger);
99     arg_desc->AddFlag("verbose", "Print info about found data");
100 
101     arg_desc->AddFlag("withdrawn", "Include withdrawn sequences");
102     arg_desc->AddFlag("replaced", "Include replaced sequences");
103     arg_desc->AddFlag("suppressed", "Include suppressed sequences");
104     arg_desc->AddFlag("unverified", "Include unverified sequences");
105     arg_desc->AddFlag("master", "Include master descriptors if any");
106     arg_desc->AddFlag("print-master", "Print master entry");
107     arg_desc->AddFlag("master-no-filter", "Do not filter master descriptors");
108 
109     //arg_desc->AddFlag("resolve-gi", "Resolve gi list");
110 
111     arg_desc->AddFlag("gi-check", "Check GI index");
112     arg_desc->AddFlag("gi-range", "Print GI range if any");
113     arg_desc->AddDefaultKey("max-gap", "MaxGap",
114                             "max gap in a single gi range",
115                             CArgDescriptions::eInteger, "10000");
116 
117     arg_desc->AddFlag("print_seq", "Print loaded Bioseq objects");
118     arg_desc->AddFlag("print_entry", "Print loaded Seq-entry objects");
119     arg_desc->AddFlag("print_split", "Print loaded Split info object");
120 
121     arg_desc->AddFlag("keep_sequences", "Keep all sequences in memory");
122 
123     arg_desc->AddOptionalKey("contig_row", "ContigRow",
124                              "contig row to fetch",
125                              CArgDescriptions::eInt8);
126     arg_desc->AddOptionalKey("contig_version", "ContigVersion",
127                              "contig version to fetch",
128                              CArgDescriptions::eInteger);
129     arg_desc->AddOptionalKey("scaffold_row", "ScaffoldRow",
130                              "scaffold row to fetch",
131                              CArgDescriptions::eInt8);
132     arg_desc->AddOptionalKey("protein_row", "ProteinRow",
133                              "protein row to fetch",
134                              CArgDescriptions::eInt8);
135 
136     arg_desc->AddFlag("check_non_empty_lookup",
137                       "Check that lookup produce non-empty result");
138     arg_desc->AddFlag("check_empty_lookup",
139                       "Check that lookup produce empty result");
140     arg_desc->AddOptionalKey("gi", "GI",
141                              "lookup by GI",
142                              CArgDescriptions::eIntId);
143     arg_desc->AddOptionalKey("contig_name", "ContigName",
144                              "lookup by contig name",
145                              CArgDescriptions::eString);
146     arg_desc->AddOptionalKey("scaffold_name", "ScaffoldName",
147                              "lookup by scaffold name",
148                              CArgDescriptions::eString);
149     arg_desc->AddOptionalKey("protein_name", "ProteinName",
150                              "lookup by protein name",
151                              CArgDescriptions::eString);
152     arg_desc->AddOptionalKey("protein_acc", "ProteinAcc",
153                              "lookup by protein accession",
154                              CArgDescriptions::eString);
155     arg_desc->AddDefaultKey("seed", "RandomSeed",
156                             "Seed for random number generator",
157                             CArgDescriptions::eInteger, "1");
158     arg_desc->AddDefaultKey("seq_acc_count", "SequentialAccCount",
159                             "size of spans of sequential accessions",
160                             CArgDescriptions::eInteger, "10");
161 
162     arg_desc->AddDefaultKey("o", "OutputFile",
163                             "Output file of ASN.1",
164                             CArgDescriptions::eOutputFile,
165                             "-");
166 
167     // Setup arg.descriptions for this application
168     SetupArgDescriptions(arg_desc.release());
169 }
170 
171 
sx_GetSeqData(const CBioseq & seq)172 string sx_GetSeqData(const CBioseq& seq)
173 {
174     CScope scope(*CObjectManager::GetInstance());
175     CBioseq_Handle bh = scope.AddBioseq(seq);
176     string ret;
177     bh.GetSeqVector().GetSeqData(0, kInvalidSeqPos, ret);
178     NON_CONST_ITERATE ( string, i, ret ) {
179         if ( *i == char(0xff) || *i == char(0) )
180             *i = char(0xf);
181     }
182     if ( 0 ) {
183         size_t w = 0;
184         ITERATE ( string, i, ret ) {
185             if ( w == 78 ) {
186                 cout << '\n';
187                 w = 0;
188             }
189             cout << "0123456789ABCDEF"[*i&0xff];
190             ++w;
191         }
192         cout << endl;
193     }
194     return ret;
195 }
196 
197 
198 /////////////////////////////////////////////////////////////////////////////
199 //  Run test
200 /////////////////////////////////////////////////////////////////////////////
201 
202 #if 0 // enable low-level SRA SDK test
203 # include <unistd.h>
204 # include <klib/rc.h>
205 # include <klib/writer.h>
206 # include <align/align-access.h>
207 # include <vdb/manager.h>
208 # include <vdb/database.h>
209 # include <vdb/table.h>
210 # include <vdb/cursor.h>
211 # include <vdb/vdb-priv.h>
212 
213 // low level SRA SDK test
214 void CheckRc(rc_t rc, const char* code, const char* file, int line)
215 {
216     if ( rc ) {
217         char buffer1[4096];
218         size_t error_len;
219         RCExplain(rc, buffer1, sizeof(buffer1), &error_len);
220         char buffer2[8192];
221         size_t len = sprintf(buffer2, "%s:%d: %s failed: %#x: %s\n",
222                              file, line, code, rc, buffer1);
223         write(2, buffer2, len);
224         exit(1);
225     }
226 }
227 #define CALL(call) CheckRc((call), #call, __FILE__, __LINE__)
228 
229 struct SThreadInfo
230 {
231     pthread_t thread_id;
232     const VCursor* cursor;
233     vector<uint32_t> columns;
234     uint64_t row_start;
235     uint64_t row_end;
236 };
237 
238 void* read_thread_func(void* arg)
239 {
240     SThreadInfo& info = *(SThreadInfo*)arg;
241     for ( int pass = 0; pass < 100; ++pass ) {
242         for ( uint64_t row = info.row_start; row < info.row_end; ++row ) {
243             for ( size_t i = 0; i < info.columns.size(); ++i ) {
244                 const void* data;
245                 uint32_t bit_offset, bit_length;
246                 uint32_t elem_count;
247                 CALL(VCursorCellDataDirect(info.cursor, row, info.columns[i],
248                                            &bit_length, &data, &bit_offset,
249                                            &elem_count));
250             }
251         }
252     }
253     return 0;
254 }
255 
256 int LowLevelTest(void)
257 {
258     if ( 1 ) {
259         cout << "LowLevelTest for CSHW01000012.1 gap info..." << endl;
260         const VDBManager* mgr = 0;
261         CALL(VDBManagerMakeRead(&mgr, 0));
262 
263         const VDatabase* db = 0;
264         CALL(VDBManagerOpenDBRead(mgr, &db, 0, "CSHW01"));
265 
266         const VTable* table = 0;
267         CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
268 
269         const VCursor* cursor = 0;
270         CALL(VTableCreateCursorRead(table, &cursor));
271         CALL(VCursorPermitPostOpenAdd(cursor));
272         CALL(VCursorOpen(cursor));
273 
274         uint32_t col_GAP_PROPS;
275         CALL(VCursorAddColumn(cursor, &col_GAP_PROPS, "GAP_PROPS"));
276 
277         const void* data;
278         uint32_t bit_offset, bit_length;
279         uint32_t elem_count;
280         CALL(VCursorCellDataDirect(cursor, 12, col_GAP_PROPS,
281                                    &bit_length, &data, &bit_offset,
282                                    &elem_count));
283         cout << "GAP_PROPS:";
284         for ( uint32_t i = 0; i < elem_count; ++i ) {
285             cout << ' ' << ((int16_t*)data)[i];
286         }
287         cout << endl;
288 
289         CALL(VCursorRelease(cursor));
290         CALL(VTableRelease(table));
291         CALL(VDatabaseRelease(db));
292         CALL(VDBManagerRelease(mgr));
293         return 0;
294     }
295 
296     if ( 1 ) {
297         cout << "LowLevelTest for AAAD01 opening..." << endl;
298         const VDBManager* mgr = 0;
299         CALL(VDBManagerMakeRead(&mgr, 0));
300 
301         const VDatabase* db = 0;
302         CALL(VDBManagerOpenDBRead(mgr, &db, 0, "AAAD01"));
303 
304         CALL(VDatabaseRelease(db));
305         CALL(VDBManagerRelease(mgr));
306         return 0;
307     }
308 
309     if ( 0 ) {
310         cout << "LowLevelTest for multiple cursor opening..." << endl;
311         for ( int i = 0; i < 10; ++i ) {
312             cout << "Iteration " << i << endl;
313             const VDBManager* mgr = 0;
314             CALL(VDBManagerMakeRead(&mgr, 0));
315 
316             const VDatabase* db = 0;
317             CALL(VDBManagerOpenDBRead(mgr, &db, 0, "GAMP01"));
318 
319             const VTable* table = 0;
320             CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
321 
322             const VCursor* cursor = 0;
323             CALL(VTableCreateCursorRead(table, &cursor));
324             CALL(VCursorPermitPostOpenAdd(cursor));
325             CALL(VCursorOpen(cursor));
326 
327             uint32_t col_SEQ_ID, col_ACC_VERSION;
328             CALL(VCursorAddColumn(cursor, &col_SEQ_ID, "SEQ_ID"));
329             CALL(VCursorAddColumn(cursor, &col_ACC_VERSION, "ACC_VERSION"));
330 
331             CALL(VCursorRelease(cursor));
332             CALL(VTableRelease(table));
333             CALL(VDatabaseRelease(db));
334             CALL(VDBManagerRelease(mgr));
335         }
336     }
337     if ( 0 ) {
338         cout << "LowLevelTest for MT cursor read..." << endl;
339         const VDBManager* mgr = 0;
340         CALL(VDBManagerMakeRead(&mgr, 0));
341 
342         const VDatabase* db = 0;
343         //CALL(VDBManagerOpenDBRead(mgr, &db, 0, "GAMP01"));
344         CALL(VDBManagerOpenDBRead(mgr, &db, 0, "JRAS01"));
345         const size_t kNumCursors = 8;
346         const size_t kNumReads = 250/8;
347 
348         const VTable* table = 0;
349         CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
350 
351         SThreadInfo tinfo[kNumCursors];
352         for ( size_t i = 0; i < kNumCursors; ++i ) {
353             cout << "Create cursor " << i << endl;
354             CALL(VTableCreateCursorRead(table, &tinfo[i].cursor));
355             CALL(VCursorPermitPostOpenAdd(tinfo[i].cursor));
356             CALL(VCursorOpen(tinfo[i].cursor));
357 
358 #define ADD_COLUMN(name)                                                \
359             do {                                                        \
360                 uint32_t column;                                        \
361                 CALL(VCursorAddColumn(tinfo[i].cursor, &column, name)); \
362                 tinfo[i].columns.push_back(column);                     \
363             } while(0)
364 
365             ADD_COLUMN("GI");
366             ADD_COLUMN("ACCESSION");
367             ADD_COLUMN("ACC_VERSION");
368             ADD_COLUMN("CONTIG_NAME");
369             ADD_COLUMN("NAME");
370             ADD_COLUMN("TITLE");
371             ADD_COLUMN("LABEL");
372             ADD_COLUMN("READ_START");
373             ADD_COLUMN("READ_LEN");
374             ADD_COLUMN("READ");
375             //ADD_COLUMN("SEQ_ID");
376             ADD_COLUMN("TAXID");
377             ADD_COLUMN("DESCR");
378             ADD_COLUMN("ANNOT");
379             ADD_COLUMN("GB_STATE");
380             ADD_COLUMN("GAP_START");
381             ADD_COLUMN("GAP_LEN");
382             ADD_COLUMN("GAP_PROPS");
383             ADD_COLUMN("GAP_LINKAGE");
384 
385 #undef ADD_COLUMN
386         }
387         for ( size_t i = 0; i < kNumCursors; ++i ) {
388             cout << "Starting thread " << i << endl;
389             tinfo[i].row_start = 1+i*kNumReads;
390             tinfo[i].row_end = tinfo[i].row_start + kNumReads;
391             pthread_create(&tinfo[i].thread_id, 0, read_thread_func, &tinfo[i]);
392         }
393         for ( size_t i = 0; i < kNumCursors; ++i ) {
394             cout << "Waiting for thread " << i << endl;
395             void* ret = 0;
396             pthread_join(tinfo[i].thread_id, &ret);
397         }
398         for ( size_t i = 0; i < kNumCursors; ++i ) {
399             CALL(VCursorRelease(tinfo[i].cursor));
400         }
401         CALL(VTableRelease(table));
402         CALL(VDatabaseRelease(db));
403         CALL(VDBManagerRelease(mgr));
404     }
405     if ( 1 ) {
406         cout << "LowLevelTest sequence reading..." << endl;
407         for ( int i = 0; i < 1; ++i ) {
408             const VDBManager* mgr = 0;
409             CALL(VDBManagerMakeRead(&mgr, 0));
410 
411             const VDatabase* db = 0;
412             CALL(VDBManagerOpenDBRead(mgr, &db, 0, "JELW01"));
413 
414             const VTable* table = 0;
415             CALL(VDatabaseOpenTableRead(db, &table, "SEQUENCE"));
416 
417             const VCursor* cursor = 0;
418             CALL(VTableCreateCursorRead(table, &cursor));
419             CALL(VCursorPermitPostOpenAdd(cursor));
420             CALL(VCursorOpen(cursor));
421 
422             char data0[10];
423             uint32_t total = 0;
424             const char* type = 0;
425             uint32_t bit_size = 0;
426             const size_t kBases = 1<<17;
427             char* buffer = new char[kBases];
428             uint32_t col;
429 
430             uint64_t row0 = 1, row1 = 300;
431             CStopWatch sw(CStopWatch::eStart);
432             if ( 1 ) {
433                 type = "packed 2na";
434                 CALL(VCursorAddColumn(cursor, &col,
435                                       "(INSDC:2na:packed)READ"));
436                 bit_size = 2;
437             }
438             if ( 0 ) {
439                 type = "packed 4na";
440                 CALL(VCursorAddColumn(cursor, &col,
441                                       "(INSDC:4na:packed)READ"));
442                 bit_size = 4;
443             }
444             if ( 0 ) {
445                 type = "byte 4na";
446                 CALL(VCursorAddColumn(cursor, &col,
447                                       "(INSDC:4na:bin)READ"));
448                 bit_size = 8;
449             }
450             const char* method = 0;
451             if ( 1 ) {
452                 method = "in chunks";
453                 for ( uint64_t row = row0; row <= row1; ++row ) {
454                     uint32_t pos = 0, elem_read, elem_rem = 1;
455                     for ( ; elem_rem; pos += elem_read ) {
456                         CALL(VCursorReadBitsDirect(cursor, row, col,
457                                                    bit_size, pos,
458                                                    buffer, 0, kBases,
459                                                    &elem_read, &elem_rem));
460                         if ( row == 1 && pos == 0 ) memcpy(data0, buffer, 10);
461                     }
462                     total += pos;
463                 }
464             }
465             else {
466                 method = "in whole";
467                 for ( uint64_t row = row0; row <= row1; ++row ) {
468                     uint32_t bit_offset, bit_length, elem_count;
469                     const void* data;
470                     CALL(VCursorCellDataDirect(cursor, row, col,
471                                                &bit_length, &data, &bit_offset,
472                                                &elem_count));
473                     assert(bit_length = bit_size);
474                     if ( row == 1 ) memcpy(data0, data, 10);
475                     total += elem_count;
476                 }
477             }
478             double time = sw.Elapsed();
479             delete[] buffer;
480             cout << "read "<<type<<" "<<method
481                  <<" time: "<<time<<" bases="<<total << endl;
482             cout << "data:" << hex;
483             for ( size_t i = 0; i < 10; ++i ) {
484                 cout << ' ' << (data0[i]&0xff);
485             }
486             cout << dec << endl;
487 
488             CALL(VCursorRelease(cursor));
489             CALL(VTableRelease(table));
490             CALL(VDatabaseRelease(db));
491             CALL(VDBManagerRelease(mgr));
492         }
493     }
494     cout << "LowLevelTest done" << endl;
495     return 0;
496 }
497 #endif
498 
Run(void)499 int CWGSTestApp::Run(void)
500 {
501 #ifdef CALL
502     return LowLevelTest();
503 #endif
504 
505     uint64_t error_count = 0;
506     const CArgs& args = GetArgs();
507 
508     string path = args["file"].AsString();
509     bool verbose = args["verbose"];
510     bool print_seq = args["print_seq"];
511     bool print_entry = args["print_entry"];
512     bool print_split = args["print_split"];
513     size_t limit_count = 100;
514     if ( args["limit_count"] ) {
515         limit_count = size_t(args["limit_count"].AsInteger());
516     }
517 
518     CNcbiOstream& out = args["o"].AsOutputFile();
519 
520     CVDBMgr mgr;
521     CStopWatch sw;
522 
523     sw.Restart();
524 
525     if ( verbose ) {
526         try {
527             string acc = CWGSDb_Impl::NormalizePathOrAccession(path);
528             string resolved = mgr.FindAccPath(acc);
529             out << "Resolved "<<path<<" -> "<<acc<<" -> "<<resolved
530                 << NcbiEndl;
531         }
532         catch ( CException& exc ) {
533             ERR_POST("FindAccPath failed: "<<exc);
534         }
535     }
536 
537     bool is_component = false, is_scaffold = false, is_protein = false;
538     uint64_t row = 0;
539     int contig_version = -1;
540     if ( args["contig_version"] ) {
541         contig_version = args["contig_version"].AsInteger();
542     }
543     else if ( path.size() > 12 && path.find('/') == NPOS ) {
544         SIZE_TYPE dot_pos = path.rfind('.');
545         if ( dot_pos != NPOS &&
546              (contig_version = NStr::StringToNonNegativeInt(path.substr(dot_pos+1))) >= 0 ) {
547             path.resize(dot_pos);
548         }
549     }
550     if ( args["contig_row"] || args["scaffold_row"] || args["protein_row"] ) {
551         if ( args["contig_row"] ) {
552             row = args["contig_row"].AsInt8();
553             is_component = true;
554         }
555         else if ( args["scaffold_row"] ) {
556             row = args["scaffold_row"].AsInt8();
557             is_scaffold = true;
558         }
559         else if ( args["protein_row"] ) {
560             row = args["protein_row"].AsInt8();
561             is_protein = true;
562         }
563     }
564     else {
565         if ( !row && (row = CWGSDb::ParseContigRow(path)) ) {
566             is_component = true;
567             path = path.substr(0, 6);
568         }
569         if ( !row && (row = CWGSDb::ParseScaffoldRow(path)) ) {
570             is_scaffold = true;
571             path = path.substr(0, 6);
572         }
573         if ( !row && (row = CWGSDb::ParseProteinRow(path)) ) {
574             is_protein = true;
575             path = path.substr(0, 6);
576         }
577     }
578     if ( row && !args["limit_count"] ) {
579         limit_count = 1;
580     }
581 
582     CWGSDb wgs_db(mgr, path);
583     if ( verbose ) {
584         out << "Opened WGS in "<<sw.Restart()
585             << NcbiEndl;
586     }
587     if ( wgs_db->GetProjectGBState() ) {
588         out << "WGS Project GB State: "<< wgs_db->GetProjectGBState() << NcbiEndl;
589     }
590     if ( wgs_db->IsReplaced() ) {
591         out << "WGS Project is replaced by "<< wgs_db->GetReplacedBy() << NcbiEndl;
592     }
593     if ( args["master"] ) {
594         CWGSDb::EDescrFilter filter = CWGSDb::eDescrDefaultFilter;
595         if ( args["master-no-filter"] ) {
596             filter = CWGSDb::eDescrNoFilter;
597         }
598         if ( !wgs_db.LoadMasterDescr(filter) ) {
599             ERR_POST("No master descriptors found");
600         }
601     }
602     if ( args["print-master"] ) {
603         if ( CRef<CSeq_entry> master = wgs_db->GetMasterSeq_entry() ) {
604             out << "Master " << MSerial_AsnText << *master;
605         }
606         else {
607             out << "No master entry" << NcbiEndl;
608         }
609     }
610 
611     vector< CConstRef<CBioseq> > all_seqs;
612     bool keep_seqs = args["keep_sequences"];
613 
614     CWGSSeqIterator::TIncludeFlags include_flags = CWGSSeqIterator::fIncludeLive;
615     if ( args["withdrawn"] ) {
616         include_flags |= CWGSSeqIterator::fIncludeWithdrawn;
617     }
618     if ( args["replaced"] ) {
619         include_flags |= CWGSSeqIterator::fIncludeReplaced;
620     }
621     if ( args["suppressed"] ) {
622         include_flags |= CWGSSeqIterator::fIncludeSuppressed;
623     }
624     if ( args["unverified"] ) {
625         include_flags |= CWGSSeqIterator::fIncludeUnverified;
626     }
627 
628     if ( 1 ) {
629         CWGSSeqIterator it;
630         // try accession
631         if ( row ) {
632             // print only one accession
633             if ( is_component ) {
634                 if ( !args["limit_count"] ) {
635                     it = CWGSSeqIterator(wgs_db, row, include_flags);
636                 }
637                 else if ( row + limit_count < row ) {
638                     it = CWGSSeqIterator(wgs_db, row, kMax_UI8, include_flags);
639                 }
640                 else {
641                     it = CWGSSeqIterator(wgs_db, row, row+limit_count-1, include_flags);
642                 }
643                 if ( !it ) {
644                     out << "No such row: "<<path
645                         << NcbiEndl;
646                 }
647             }
648         }
649         else {
650             // otherwise scan all sequences
651             it = CWGSSeqIterator(wgs_db, include_flags);
652         }
653         for ( size_t count = 0; it && count < limit_count; ++it, ++count ) {
654             if ( contig_version ) {
655                 it.SelectAccVersion(contig_version);
656             }
657             out << it.GetAccession()<<'.'<<it.GetAccVersion();
658             if ( it.HasGi() ) {
659                 out << " gi: "<<it.GetGi();
660             }
661             out << " len: "<<it.GetSeqLength();
662             if ( it.GetGBState() ) {
663                 out << " gbstate: "<<it.GetGBState();
664             }
665             if ( it.HasSeqHash() ) {
666                 out << " hash: 0x"<<hex<<it.GetSeqHash()<<dec;
667             }
668             if ( it.HasPublicComment() ) {
669                 out << " comment: \""<<it.GetPublicComment()<<"\"";
670             }
671             out << '\n';
672             CRef<CBioseq> seq1 = it.GetBioseq();
673             CRef<CBioseq> seq2 = it.GetBioseq(it.fDefaultIds|it.fInst_ncbi4na);
674             if ( print_seq ) {
675                 out << MSerial_AsnText << *seq1;
676             }
677             if ( print_entry ) {
678                 out << MSerial_AsnText << *it.GetSeq_entry();
679             }
680             if ( print_split ) {
681                 out << MSerial_AsnText << *it.GetSplitInfo();
682             }
683             string data1 = sx_GetSeqData(*seq1);
684             string data2 = sx_GetSeqData(*seq2);
685             if ( data1 != data2 ) {
686                 size_t pos = 0;
687                 while ( data1[pos] == data2[pos] ) {
688                     ++pos;
689                 }
690                 ERR_POST(Fatal<<"Different Seq-data at " << pos << ": " <<
691                          MSerial_AsnText << *seq1 << MSerial_AsnText << *seq2);
692             }
693             if ( keep_seqs ) {
694                 all_seqs.push_back(seq1);
695             }
696         }
697     }
698 
699     /*
700       if ( args["resolve-gi"] ) {
701       vector<string> gis_str;
702       NStr::Split(args["resolve-gi"].AsString(), ",", gis_str);
703       ITERATE ( vector<string>, it, gis_str ) {
704       TGi gi = NStr::StringToNumeric<TIntId>(*it);
705 
706       }
707       }
708     */
709 
710     if ( args["gi-check"] ) {
711         typedef map<TGi, uint64_t> TGiIdx;
712         TGiIdx nuc_idx;
713         for ( CWGSGiIterator it(wgs_db, CWGSGiIterator::eNuc); it; ++it ) {
714             nuc_idx[it.GetGi()] = it.GetRowId();
715         }
716         for ( CWGSSeqIterator it(wgs_db, include_flags); it; ++it ) {
717             if ( !it.HasGi() ) {
718                 continue;
719             }
720             TGi gi = it.GetGi();
721             uint64_t row = it.GetCurrentRowId();
722             TGiIdx::iterator idx_it = nuc_idx.find(gi);
723             if ( idx_it == nuc_idx.end() ) {
724                 ERR_POST("GI "<<gi<<" row="<<row<<" idx=none");
725                 ++error_count;
726             }
727             else {
728                 if ( idx_it->second != row ) {
729                     ERR_POST("GI "<<gi<<" row="<<row<<" idx="<<idx_it->second);
730                     ++error_count;
731                 }
732                 nuc_idx.erase(idx_it);
733             }
734         }
735         ITERATE ( TGiIdx, it, nuc_idx ) {
736             ERR_POST("GI "<<it->first<<" row=none"<<" idx="<<it->second);
737             ++error_count;
738         }
739     }
740     if ( args["gi-range"] ) {
741         pair<TGi, TGi> gi_range = wgs_db.GetNucGiRange();
742         if ( gi_range.second != ZERO_GI ) {
743             out << "Nucleotide GI range: "
744                 << gi_range.first << " - " << gi_range.second
745                 << NcbiEndl;
746         }
747         else {
748             out << "Nucleotide GI range is empty" << NcbiEndl;
749         }
750         gi_range = wgs_db.GetProtGiRange();
751         if ( gi_range.second != ZERO_GI ) {
752             out << "Protein GI range: "
753                 << gi_range.first << " - " << gi_range.second
754                 << NcbiEndl;
755         }
756         else {
757             out << "Protein GI range is empty" << NcbiEndl;
758         }
759     }
760     bool check_non_empty_lookup = args["check_non_empty_lookup"];
761     bool check_empty_lookup = args["check_empty_lookup"];
762     if ( args["gi"] ) {
763         TGi gi = GI_FROM(TIntId, args["gi"].AsIntId());
764         CRef<CWGSResolver> resolver = CWGSResolver_VDB::CreateResolver(mgr);
765         if ( resolver ) {
766             CWGSResolver::TWGSPrefixes prefixes = resolver->GetPrefixes(gi);
767             if ( prefixes.empty() ) {
768                 out << "No WGS accessions with gi "<<gi<<NcbiEndl;
769             }
770             else {
771                 ITERATE ( CWGSResolver::TWGSPrefixes, it, prefixes ) {
772                     out << "GI "<<gi<<" is found in WGS " << *it << NcbiEndl;
773                 }
774             }
775         }
776         CWGSGiIterator gi_it(wgs_db, gi);
777         if ( !gi_it ) {
778             out << "GI "<<gi<<" not found" << NcbiEndl;
779             if ( check_non_empty_lookup ) {
780                 ++error_count;
781             }
782         }
783         else if ( gi_it.GetSeqType() == gi_it.eNuc ) {
784             out << "GI "<<gi<<" Nucleotide row: "<<gi_it.GetRowId()
785                 << NcbiEndl;
786             if ( check_empty_lookup ) {
787                 ++error_count;
788             }
789             CWGSSeqIterator it(wgs_db, gi_it.GetRowId(), include_flags);
790             if ( !it ) {
791                 out << "No such row: "<< gi_it.GetRowId() << NcbiEndl;
792                 ++error_count;
793             }
794             else {
795                 out << "GI "<<gi<<" len: "<<it.GetSeqLength() << NcbiEndl;
796                 if ( print_seq ) {
797                     out << MSerial_AsnText << *it.GetBioseq();
798                 }
799                 if ( print_entry ) {
800                     out << MSerial_AsnText << *it.GetSeq_entry();
801                 }
802             }
803         }
804         else {
805             out << "GI "<<gi<<" Protein row: "<<gi_it.GetRowId() << NcbiEndl;
806             if ( check_empty_lookup ) {
807                 ++error_count;
808             }
809             CWGSProteinIterator it(wgs_db, gi_it.GetRowId());
810             if ( !it ) {
811                 out << "No such row: "<< gi_it.GetRowId() << NcbiEndl;
812                 ++error_count;
813             }
814             else {
815                 out << "GI "<<gi<<" len: "<<it.GetSeqLength();
816                 if ( it.HasPublicComment() ) {
817                     out << " comment: \""<<it.GetPublicComment()<<"\"";
818                 }
819                 out << NcbiEndl;
820 
821                 if ( print_seq ) {
822                     out << MSerial_AsnText << *it.GetBioseq();
823                 }
824                 if ( print_entry ) {
825                     out << MSerial_AsnText << *it.GetSeq_entry();
826                 }
827             }
828         }
829     }
830     if ( args["contig_name"] ) {
831         string name = args["contig_name"].AsString();
832         uint64_t row_id = wgs_db.GetContigNameRowId(name);
833         out << "Contig name "<<name<<" is in CONTIG table row " << row_id
834             << NcbiEndl;
835         if ( !row_id ) {
836             if ( check_non_empty_lookup ) {
837                 ++error_count;
838             }
839         }
840         else {
841             if ( check_empty_lookup ) {
842                 ++error_count;
843             }
844             CWGSSeqIterator it(wgs_db, row_id, include_flags);
845             if ( !it ) {
846                 out << "CONTIG: No such row: "<< row_id << NcbiEndl;
847                 ++error_count;
848             }
849             else {
850                 out << "CONTIG["<<row_id<<"] len: "<<it.GetSeqLength()
851                     << " name: " << it.GetContigName()
852                     << NcbiEndl;
853                 if ( !NStr::EqualNocase(it.GetContigName(), name) ) {
854                     out << "Name is different!" << NcbiEndl;
855                     ++error_count;
856                 }
857                 if ( print_seq ) {
858                     out << MSerial_AsnText << *it.GetBioseq();
859                 }
860                 if ( print_entry ) {
861                     out << MSerial_AsnText << *it.GetSeq_entry();
862                 }
863             }
864         }
865     }
866     if ( args["scaffold_name"] ) {
867         string name = args["scaffold_name"].AsString();
868         uint64_t row_id = wgs_db.GetScaffoldNameRowId(name);
869         out << "Scaffold name "<<name<<" is in SCAFFOLD table row " << row_id
870             << NcbiEndl;
871         if ( !row_id ) {
872             if ( check_non_empty_lookup ) {
873                 ++error_count;
874             }
875         }
876         else {
877             if ( check_empty_lookup ) {
878                 ++error_count;
879             }
880             CWGSScaffoldIterator it(wgs_db, row_id);
881             if ( !it ) {
882                 out << "SCAFFOLD: No such row: "<< row_id << NcbiEndl;
883                 ++error_count;
884             }
885             else {
886                 out << "SCAFFOLD["<<row_id<<"] len: "<<it.GetSeqLength()
887                     << " name: " << it.GetScaffoldName()
888                     << NcbiEndl;
889                 if ( !NStr::EqualNocase(it.GetScaffoldName(), name) ) {
890                     out << "Name is different!" << NcbiEndl;
891                     ++error_count;
892                 }
893                 if ( print_seq ) {
894                     out << MSerial_AsnText << *it.GetBioseq();
895                 }
896                 if ( print_entry ) {
897                     out << MSerial_AsnText << *it.GetSeq_entry();
898                 }
899                 if ( keep_seqs ) {
900                     all_seqs.push_back(it.GetBioseq());
901                 }
902             }
903         }
904     }
905     if ( args["protein_name"] ) {
906         string name = args["protein_name"].AsString();
907         uint64_t row_id = wgs_db.GetProteinNameRowId(name);
908         out << "Protein name "<<name<<" is in PROTEIN table row " << row_id
909             << NcbiEndl;
910         if ( !row_id ) {
911             if ( check_non_empty_lookup ) {
912                 ++error_count;
913             }
914         }
915         else {
916             if ( check_empty_lookup ) {
917                 ++error_count;
918             }
919             CWGSProteinIterator it(wgs_db, row_id);
920             if ( !it ) {
921                 out << "PROTEIN: No such row: "<< row_id << NcbiEndl;
922                 ++error_count;
923             }
924             else {
925                 out << "PROTEIN["<<row_id<<"] len: "<<it.GetSeqLength()
926                     << " name: " << it.GetProteinName()
927                     << NcbiEndl;
928                 if ( !NStr::EqualNocase(it.GetProteinName(), name) ) {
929                     out << "Name is different!" << NcbiEndl;
930                     ++error_count;
931                 }
932                 if ( print_seq ) {
933                     out << MSerial_AsnText << *it.GetBioseq();
934                 }
935                 if ( print_entry ) {
936                     out << MSerial_AsnText << *it.GetSeq_entry();
937                 }
938             }
939         }
940     }
941     if ( args["protein_acc"] ) {
942         string param = args["protein_acc"].AsString();
943         unsigned random_count = 0;
944         try {
945             random_count = NStr::StringToNumeric<unsigned>(param);
946         }
947         catch ( CStringException& /*ignored*/ ) {
948         }
949         vector<string> accs;
950         if ( random_count ) {
951             unsigned seq_count = args["seq_acc_count"].AsInteger();
952             CRandom r(args["seed"].AsInteger());
953             for ( unsigned i = 0; i < random_count; ++i ) {
954                 string prefix;
955                 for ( int j = 0; j < 3; ++j ) {
956                     prefix += char(r.GetRand('A', 'Z'));
957                 }
958                 unsigned start = r.GetRand(0, 99999);
959                 unsigned count = min(min(100000-start, random_count-i), r.GetRand(1, seq_count));
960                 for ( unsigned j = 0; j < count; ++j ) {
961                     string s = NStr::IntToString(start+j);
962                     s = string(5-s.size(), '0') + s;
963                     accs.push_back(prefix+s);
964                 }
965                 i += count-1;
966             }
967         }
968         else {
969             NStr::Split(param, ",", accs);
970         }
971         unsigned found_count = 0;
972         CStopWatch sw(CStopWatch::eStart);
973         CRef<CWGSResolver> resolver = CWGSResolver_VDB::CreateResolver(mgr);
974         if ( resolver ) {
975             for ( auto& acc_ver : accs ) {
976                 string acc, ver;
977                 NStr::SplitInTwo(acc_ver, ".", acc, ver);
978                 CWGSResolver::TWGSPrefixes prefixes = resolver->GetPrefixes(acc);
979                 if ( random_count ) {
980                     found_count += (prefixes.size()!=0);
981                 }
982                 else if ( prefixes.empty() ) {
983                     out << "No WGS accessions with protein acc "<<acc<<NcbiEndl;
984                 }
985                 else {
986                     ITERATE ( CWGSResolver::TWGSPrefixes, it, prefixes ) {
987                         out << "Protein acc "<<acc<<" is found in WGS " << *it << NcbiEndl;
988                     }
989                 }
990             }
991         }
992         out << "Resolved "<<accs.size()<<" WGS accessions in " << sw.Restart() << "s" << endl;
993         if ( random_count ) {
994             out << "Found valid WGS accessions: "<<found_count<<endl;
995         }
996         if ( !random_count ) {
997             for ( auto& acc_ver : accs ) {
998                 string acc, ver;
999                 NStr::SplitInTwo(acc_ver, ".", acc, ver);
1000                 int version = ver.empty()? -1: NStr::StringToNumeric<int>(ver);
1001                 uint64_t row_id = wgs_db.GetProtAccRowId(acc, version);
1002                 out << "Protein acc "<<acc_ver<<" is in PROTEIN table row " << row_id
1003                     << NcbiEndl;
1004                 if ( !row_id ) {
1005                     if ( check_non_empty_lookup ) {
1006                         ++error_count;
1007                     }
1008                 }
1009                 else {
1010                     if ( check_empty_lookup ) {
1011                         ++error_count;
1012                     }
1013                     CWGSProteinIterator it(wgs_db, row_id);
1014                     if ( !it ) {
1015                         out << "PROTEIN: No such row: "<< row_id << NcbiEndl;
1016                         ++error_count;
1017                     }
1018                     else {
1019                         out << "PROTEIN["<<row_id<<"] len: "<<it.GetSeqLength()
1020                             << " acc: " << it.GetAccession()<<"."<<it.GetAccVersion()
1021                             << NcbiEndl;
1022                         if ( !NStr::EqualNocase(it.GetAccession(), acc) ) {
1023                             out << "Accession is different!" << NcbiEndl;
1024                             ++error_count;
1025                         }
1026                         if ( version != -1 && it.GetAccVersion() != version ) {
1027                             out << "Version is different!" << NcbiEndl;
1028                             ++error_count;
1029                         }
1030                         if ( print_seq ) {
1031                             out << MSerial_AsnText << *it.GetBioseq();
1032                         }
1033                         if ( print_entry ) {
1034                             out << MSerial_AsnText << *it.GetSeq_entry();
1035                         }
1036                     }
1037                 }
1038             }
1039             out << "Found "<<accs.size()<<" WGS accessions in " << sw.Restart() << "s" << endl;
1040         }
1041     }
1042 
1043     if ( 1 ) {
1044         CWGSScaffoldIterator it;
1045         if ( row ) {
1046             if ( is_scaffold ) {
1047                 it = CWGSScaffoldIterator(wgs_db, row);
1048                 if ( !it ) {
1049                     out << "No such scaffold row: "<<path
1050                         << NcbiEndl;
1051                 }
1052             }
1053         }
1054         else {
1055             it = CWGSScaffoldIterator(wgs_db);
1056         }
1057         for ( size_t count = 0; it && count < limit_count; ++it, ++count ) {
1058             out << it.GetScaffoldName() << '\n';
1059             CRef<CBioseq> seq = it.GetBioseq();
1060             if ( print_seq ) {
1061                 out << MSerial_AsnText << *seq;
1062             }
1063             if ( print_entry ) {
1064                 out << MSerial_AsnText << *it.GetSeq_entry();
1065             }
1066         }
1067     }
1068 
1069     if ( 1 ) {
1070         CWGSProteinIterator it;
1071         if ( row ) {
1072             if ( is_protein ) {
1073                 it = CWGSProteinIterator(wgs_db, row);
1074                 if ( !it ) {
1075                     out << "No such protein row: "<<path
1076                         << NcbiEndl;
1077                 }
1078             }
1079         }
1080         else {
1081             it = CWGSProteinIterator(wgs_db);
1082         }
1083         for ( size_t count = 0; it && count < limit_count; ++it, ++count ) {
1084             out << it.GetProteinName() << '\n';
1085             CRef<CBioseq> seq = it.GetBioseq();
1086             if ( print_seq ) {
1087                 out << MSerial_AsnText << *seq;
1088             }
1089             if ( print_entry ) {
1090                 out << MSerial_AsnText << *it.GetSeq_entry();
1091             }
1092         }
1093     }
1094 
1095     if ( error_count ) {
1096         out << "Failure. Error count: "<<error_count<< NcbiEndl;
1097         return 1;
1098     }
1099     else {
1100         out << "Success." << NcbiEndl;
1101         return 0;
1102     }
1103 }
1104 
1105 
1106 /////////////////////////////////////////////////////////////////////////////
1107 //  Cleanup
1108 
1109 
Exit(void)1110 void CWGSTestApp::Exit(void)
1111 {
1112     SetDiagStream(0);
1113 }
1114 
1115 
1116 /////////////////////////////////////////////////////////////////////////////
1117 //  MAIN
1118 
1119 
main(int argc,const char * argv[])1120 int main(int argc, const char* argv[])
1121 {
1122     // Execute main application function
1123     return CWGSTestApp().AppMain(argc, argv);
1124 }
1125