1 /* $Id: asn2flat.cpp 637424 2021-09-13 13:12:39Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Aaron Ucko, Mati Shomrat, Mike DiCuccio, Jonathan Kans, NCBI
27 *
28 * File Description:
29 * flat-file generator application
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbi_signal.hpp>
38 #include <connect/ncbi_core_cxx.hpp>
39
40 #include <serial/serial.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/serial.hpp>
43
44 #include <objects/seqset/Seq_entry.hpp>
45 #include <objects/seqloc/Seq_loc.hpp>
46 #include <objects/seqloc/Seq_interval.hpp>
47 #include <objects/submit/Seq_submit.hpp>
48 #include <objects/seq/seq_macros.hpp>
49
50 #include <objmgr/object_manager.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/seq_entry_ci.hpp>
53 #include <objmgr/util/sequence.hpp>
54
55 #include <objtools/cleanup/cleanup.hpp>
56 #include <objtools/format/flat_file_config.hpp>
57 #include <objtools/format/flat_file_generator.hpp>
58 #include <objtools/format/flat_expt.hpp>
59 #include <objects/seqset/gb_release_file.hpp>
60
61 #include <util/compress/zlib.hpp>
62 #include <util/compress/stream.hpp>
63
64 #include <objmgr/util/objutil.hpp>
65
66 #include <misc/data_loaders_util/data_loaders_util.hpp>
67 #include <objtools/data_loaders/genbank/gbloader.hpp>
68
69 #define USE_CDDLOADER
70
71 #if defined(HAVE_LIBGRPC) && defined(HAVE_NCBI_VDB)
72 #define USE_SNPLOADER
73 #endif
74
75 #ifdef USE_SNPLOADER
76 // ID-5865 : For SNP retrieval in PSG mode via SNP data loader
77 # include <misc/grpc_integration/grpc_integration.hpp>
78 # include <grpc++/grpc++.h>
79 # include <objects/dbsnp/primary_track/dbsnp.grpc.pb.h>
80 # include <objects/dbsnp/primary_track/dbsnp.pb.h>
81 #endif
82
83 #ifdef USE_CDDLOADER
84 #include <sra/data_loaders/snp/snploader.hpp>
85 #include <objtools/data_loaders/cdd/cdd_loader/cdd_loader.hpp>
86 #endif
87
88
89
90 BEGIN_NCBI_SCOPE
91 USING_SCOPE(objects);
92
93 class CHTMLFormatterEx : public IHTMLFormatter
94 {
95 public:
96 CHTMLFormatterEx(CRef<CScope> scope);
97 ~CHTMLFormatterEx() override;
98
99 void FormatProteinId(string&, const CSeq_id& seq_id, const string& prot_id) const override;
100 void FormatTranscriptId(string &, const CSeq_id& seq_id, const string& nuc_id) const override;
101 void FormatNucSearch(CNcbiOstream& os, const string& id) const override;
102 void FormatNucId(string& str, const CSeq_id& seq_id, TIntId gi, const string& acc_id) const override;
103 void FormatTaxid(string& str, const TTaxId taxid, const string& taxname) const override;
104 void FormatLocation(string& str, const CSeq_loc& loc, TIntId gi, const string& visible_text) const override;
105 void FormatModelEvidence(string& str, const SModelEvidance& me) const override;
106 void FormatTranscript(string& str, const string& name) const override;
107 void FormatGeneralId(CNcbiOstream& os, const string& id) const override;
108 void FormatGapLink(CNcbiOstream& os, TSeqPos gap_size, const string& id, bool is_prot) const override;
109 void FormatUniProtId(string& str, const string& prot_id) const override;
110
111 private:
112 mutable CRef<CScope> m_scope;
113 };
114
115
CHTMLFormatterEx(CRef<CScope> scope)116 CHTMLFormatterEx::CHTMLFormatterEx(CRef<CScope> scope) : m_scope(scope)
117 {
118 }
119
~CHTMLFormatterEx()120 CHTMLFormatterEx::~CHTMLFormatterEx()
121 {
122 }
123
FormatProteinId(string & str,const CSeq_id & seq_id,const string & prot_id) const124 void CHTMLFormatterEx::FormatProteinId(string& str, const CSeq_id& seq_id, const string& prot_id) const
125 {
126 string index = prot_id;
127 CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id);
128 vector< CSeq_id_Handle > ids = bsh.GetId();
129 ITERATE(vector< CSeq_id_Handle >, it, ids) {
130 CSeq_id_Handle hid = *it;
131 if (hid.IsGi()) {
132 index = NStr::NumericToString(hid.GetGi());
133 break;
134 }
135 }
136 str = "<a href=\"";
137 str += strLinkBaseProt;
138 str += index;
139 str += "\">";
140 str += prot_id;
141 str += "</a>";
142 }
143
FormatTranscriptId(string & str,const CSeq_id & seq_id,const string & nuc_id) const144 void CHTMLFormatterEx::FormatTranscriptId(string& str, const CSeq_id& seq_id, const string& nuc_id) const
145 {
146 string index = nuc_id;
147 CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id);
148 vector< CSeq_id_Handle > ids = bsh.GetId();
149 ITERATE(vector< CSeq_id_Handle >, it, ids) {
150 CSeq_id_Handle hid = *it;
151 if (hid.IsGi()) {
152 index = NStr::NumericToString(hid.GetGi());
153 break;
154 }
155 }
156 str = "<a href=\"";
157 str += strLinkBaseNuc;
158 str += index;
159 str += "\">";
160 str += nuc_id;
161 str += "</a>";
162 }
163
FormatNucId(string & str,const CSeq_id & seq_id,TIntId gi,const string & acc_id) const164 void CHTMLFormatterEx::FormatNucId(string& str, const CSeq_id& seq_id, TIntId gi, const string& acc_id) const
165 {
166 str = "<a href=\"";
167 str += strLinkBaseNuc + NStr::NumericToString(gi) + "\">" + acc_id + "</a>";
168 }
169
FormatTaxid(string & str,const TTaxId taxid,const string & taxname) const170 void CHTMLFormatterEx::FormatTaxid(string& str, const TTaxId taxid, const string& taxname) const
171 {
172 if (!NStr::StartsWith(taxname, "Unknown", NStr::eNocase)) {
173 if (taxid > ZERO_TAX_ID) {
174 str += "<a href=\"";
175 str += strLinkBaseTaxonomy;
176 str += "id=";
177 str += NStr::NumericToString(taxid);
178 str += "\">";
179 }
180 else {
181 string t_taxname = taxname;
182 replace(t_taxname.begin(), t_taxname.end(), ' ', '+');
183 str += "<a href=\"";
184 str += strLinkBaseTaxonomy;
185 str += "name=";
186 str += taxname;
187 str += "\">";
188 }
189 str += taxname;
190 str += "</a>";
191 }
192 else {
193 str = taxname;
194 }
195
196 TryToSanitizeHtml(str);
197 }
198
FormatNucSearch(CNcbiOstream & os,const string & id) const199 void CHTMLFormatterEx::FormatNucSearch(CNcbiOstream& os, const string& id) const
200 {
201 os << "<a href=\"" << strLinkBaseNucSearch << id << "\">" << id << "</a>";
202 }
203
FormatLocation(string & strLink,const CSeq_loc & loc,TIntId gi,const string & visible_text) const204 void CHTMLFormatterEx::FormatLocation(string& strLink, const CSeq_loc& loc, TIntId gi, const string& visible_text) const
205 {
206 // check if this is a protein or nucleotide link
207 bool is_prot = false;
208 {{
209 CBioseq_Handle bioseq_h;
210 ITERATE(CSeq_loc, loc_ci, loc) {
211 bioseq_h = m_scope->GetBioseqHandle(loc_ci.GetSeq_id());
212 if (bioseq_h) {
213 break;
214 }
215 }
216 if (bioseq_h) {
217 is_prot = (bioseq_h.GetBioseqMolType() == CSeq_inst::eMol_aa);
218 }
219 }}
220
221 // assembly of the actual string:
222 strLink.reserve(100); // euristical URL length
223
224 strLink = "<a href=\"";
225
226 // link base
227 if (is_prot) {
228 strLink += strLinkBaseProt;
229 }
230 else {
231 strLink += strLinkBaseNuc;
232 }
233 strLink += NStr::NumericToString(gi);
234
235 // location
236 if (loc.IsInt() || loc.IsPnt()) {
237 TSeqPos iFrom = loc.GetStart(eExtreme_Positional) + 1;
238 TSeqPos iTo = loc.GetStop(eExtreme_Positional) + 1;
239 strLink += "?from=";
240 strLink += NStr::IntToString(iFrom);
241 strLink += "&to=";
242 strLink += NStr::IntToString(iTo);
243 }
244 else if (visible_text != "Precursor") {
245 // TODO: this fails on URLs that require "?itemID=" (e.g. almost any, such as U54469)
246 strLink += "?itemid=TBD";
247 }
248
249 strLink += "\">";
250 strLink += visible_text;
251 strLink += "</a>";
252 }
253
FormatModelEvidence(string & str,const SModelEvidance & me) const254 void CHTMLFormatterEx::FormatModelEvidence(string& str, const SModelEvidance& me) const
255 {
256 str += "<a href=\"";
257 str += strLinkBaseNuc;
258 if (me.gi > ZERO_GI) {
259 str += NStr::NumericToString(me.gi);
260 }
261 else {
262 str += me.name;
263 }
264 str += "?report=graph";
265 if ((me.span.first >= 0) && (me.span.second >= me.span.first)) {
266 const Int8 kPadAmount = 500;
267 // The "+1" is because we display 1-based to user and in URL
268 str += "&v=";
269 str += NStr::NumericToString(max<Int8>(me.span.first + 1 - kPadAmount, 1));
270 str += ":";
271 str += NStr::NumericToString(me.span.second + 1 + kPadAmount); // okay if second number goes over end of sequence
272 }
273 str += "\">";
274 str += me.name;
275 str += "</a>";
276 }
277
FormatTranscript(string & str,const string & name) const278 void CHTMLFormatterEx::FormatTranscript(string& str, const string& name) const
279 {
280 str += "<a href=\"";
281 str += strLinkBaseNuc;
282 str += name;
283 str += "\">";
284 str += name;
285 str += "</a>";
286 }
287
FormatGeneralId(CNcbiOstream & os,const string & id) const288 void CHTMLFormatterEx::FormatGeneralId(CNcbiOstream& os, const string& id) const
289 {
290 os << "<a href=\"" << strLinkBaseNuc << id << "\">" << id << "</a>";
291 }
292
FormatGapLink(CNcbiOstream & os,TSeqPos gap_size,const string & id,bool is_prot) const293 void CHTMLFormatterEx::FormatGapLink(CNcbiOstream& os, TSeqPos gap_size,
294 const string& id, bool is_prot) const
295 {
296 const string link_base = (is_prot ? strLinkBaseProt : strLinkBaseNuc);
297 const char *mol_type = (is_prot ? "aa" : "bp" );
298 os << " [gap " << gap_size << " " << mol_type << "]" <<
299 " <a href=\"" << link_base << id << "?expand-gaps=on\">Expand Ns</a>";
300 }
301
FormatUniProtId(string & str,const string & prot_id) const302 void CHTMLFormatterEx::FormatUniProtId(string& str, const string& prot_id) const
303 {
304 str = "<a href=\"";
305 str += strLinkBaseUniProt;
306 str += prot_id;
307 str += "\">";
308 str += prot_id;
309 str += "</a>";
310 }
311
312 class CAsn2FlatApp : public CNcbiApplication, public CGBReleaseFile::ISeqEntryHandler
313 {
314 public:
315 CAsn2FlatApp();
316 ~CAsn2FlatApp();
317
318 void Init() override;
319 int Run() override;
320
321 bool HandleSeqEntry(CRef<CSeq_entry>& se) override;
322 bool HandleSeqEntry(const CSeq_entry_Handle& seh);
323
324 protected:
325 void HandleSeqSubmit(CObjectIStream& is );
326 void HandleSeqSubmit(CSeq_submit& sub);
327 void HandleSeqId(const string& id);
328
329 CSeq_entry_Handle ObtainSeqEntryFromSeqEntry(CObjectIStream& is, bool report = false);
330 CSeq_entry_Handle ObtainSeqEntryFromBioseq(CObjectIStream& is, bool report = false);
331 CSeq_entry_Handle ObtainSeqEntryFromBioseqSet(CObjectIStream& is, bool report = false);
332
333 CNcbiOstream* OpenFlatfileOstream(const string& name);
334
335 private:
336 // types
337 typedef CFlatFileConfig::CGenbankBlockCallback TGenbankBlockCallback;
338
339 CObjectIStream* x_OpenIStream(const CArgs& args);
340
341 CFlatFileGenerator* x_CreateFlatFileGenerator(const CArgs& args);
342 TGenbankBlockCallback* x_GetGenbankCallback(const CArgs& args);
343 TSeqPos x_GetFrom(const CArgs& args);
344 TSeqPos x_GetTo (const CArgs& args);
345 ENa_strand x_GetStrand(const CArgs& args);
346 void x_GetLocation(const CSeq_entry_Handle& entry,
347 const CArgs& args, CSeq_loc& loc);
348 CBioseq_Handle x_DeduceTarget(const CSeq_entry_Handle& entry);
349 void x_CreateCancelBenchmarkCallback();
350 int x_AddSNPAnnots(CBioseq_Handle& bsh);
351
352 // data
353 CRef<CObjectManager> m_Objmgr; // Object Manager
354 CRef<CScope> m_Scope;
355
356 CNcbiOstream* m_Os; // all sequence output stream
357 bool m_OnlyNucs;
358 bool m_OnlyProts;
359
360 CNcbiOstream* m_On; // nucleotide output stream
361 CNcbiOstream* m_Og; // genomic output stream
362 CNcbiOstream* m_Or; // RNA output stream
363 CNcbiOstream* m_Op; // protein output stream
364 CNcbiOstream* m_Ou; // unknown output stream
365
366 CRef<CFlatFileGenerator> m_FFGenerator; // Flat-file generator
367 unique_ptr<ICanceled> m_pCanceledCallback;
368 bool m_do_cleanup;
369 bool m_Exception;
370 bool m_FetchFail;
371 bool m_PSGMode;
372 #ifdef USE_SNPLOADER
373 CRef<CSNPDataLoader> m_SNPDataLoader;
374 unique_ptr<ncbi::grpcapi::dbsnp::primary_track::DbSnpPrimaryTrack::Stub> m_SNPTrackStub;
375 #endif
376 #ifdef USE_CDDLOADER
377 CRef<CCDDDataLoader> m_CDDDataLoader;
378 #endif
379 };
380
381
382 // constructor
CAsn2FlatApp()383 CAsn2FlatApp::CAsn2FlatApp()
384 {
385 const CVersionInfo vers (6, NCBI_SC_VERSION_PROXY, NCBI_TEAMCITY_BUILD_NUMBER_PROXY);
386 SetVersion (vers);
387 }
388
389 // destructor
~CAsn2FlatApp()390 CAsn2FlatApp::~CAsn2FlatApp()
391 {
392 }
393
Init()394 void CAsn2FlatApp::Init()
395 {
396 unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
397 arg_desc->SetUsageContext("", "Convert an ASN.1 Seq-entry into a flat report");
398
399 // input
400 {{
401 arg_desc->SetCurrentGroup("Input/Output Options");
402 // name
403 arg_desc->AddOptionalKey("i", "InputFile",
404 "Input file name",
405 CArgDescriptions::eInputFile);
406
407 // input file serial format (AsnText\AsnBinary\XML, default: AsnText)
408 arg_desc->AddOptionalKey("serial", "SerialFormat", "Input file format",
409 CArgDescriptions::eString);
410 arg_desc->SetConstraint("serial", &(*new CArgAllow_Strings,
411 "text", "binary", "XML"));
412 arg_desc->AddFlag("sub", "Submission");
413 // id
414 arg_desc->AddOptionalKey("id", "ID",
415 "Specific ID to display",
416 CArgDescriptions::eString);
417 arg_desc->AddOptionalKey("ids", "IDFile",
418 "FIle of IDs to display, one per line",
419 CArgDescriptions::eInputFile);
420
421 // input type:
422 arg_desc->AddDefaultKey( "type", "AsnType", "ASN.1 object type",
423 CArgDescriptions::eString, "any" );
424 arg_desc->SetConstraint( "type",
425 &( *new CArgAllow_Strings, "any", "seq-entry", "bioseq", "bioseq-set", "seq-submit" ) );
426
427 // single output name
428 arg_desc->AddOptionalKey("o", "SingleOutputFile",
429 "Single output file name", CArgDescriptions::eOutputFile);
430
431 // file names
432 arg_desc->AddOptionalKey("on", "NucleotideOutputFile",
433 "Nucleotide output file name", CArgDescriptions::eOutputFile);
434 arg_desc->SetDependency("on", CArgDescriptions::eExcludes, "o");
435
436 arg_desc->AddOptionalKey("og", "GenomicOutputFile",
437 "Genomic output file name", CArgDescriptions::eOutputFile);
438 arg_desc->SetDependency("og", CArgDescriptions::eExcludes, "o");
439 arg_desc->SetDependency("og", CArgDescriptions::eExcludes, "on");
440
441 arg_desc->AddOptionalKey("or", "RNAOutputFile",
442 "RNA output file name", CArgDescriptions::eOutputFile);
443 arg_desc->SetDependency("or", CArgDescriptions::eExcludes, "o");
444 arg_desc->SetDependency("or", CArgDescriptions::eExcludes, "on");
445
446 arg_desc->AddOptionalKey("op", "ProteinOutputFile",
447 "Protein output file name", CArgDescriptions::eOutputFile);
448 arg_desc->SetDependency("op", CArgDescriptions::eExcludes, "o");
449
450 arg_desc->AddOptionalKey("ou", "UnknownOutputFile",
451 "Unknown output file name", CArgDescriptions::eOutputFile);
452 arg_desc->SetDependency("ou", CArgDescriptions::eExcludes, "o");
453 }}
454
455 // batch processing
456 {{
457 arg_desc->SetCurrentGroup("Batch Processing Options");
458 arg_desc->AddFlag("batch", "Process NCBI release file");
459 // compression
460 arg_desc->AddFlag("c", "Compressed file");
461 // propogate top descriptors
462 arg_desc->AddFlag("p", "Propagate top descriptors");
463 }}
464
465 // in flat_file_config.cpp
466 CFlatFileConfig::AddArgumentDescriptions(*arg_desc);
467
468 // debugging options
469 {{
470 arg_desc->SetCurrentGroup("Debugging Options - Subject to change or removal without warning");
471
472 // benchmark cancel-checking
473 arg_desc->AddFlag(
474 "benchmark-cancel-checking",
475 "Check statistics on how often the flatfile generator checks if "
476 "it should be canceled. This also sets up SIGUSR1 to trigger "
477 "cancellation." );
478 }}
479
480 CDataLoadersUtil::AddArgumentDescriptions(*arg_desc);
481
482 arg_desc->SetCurrentGroup(kEmptyStr);
483 SetupArgDescriptions(arg_desc.release());
484 }
485
486
OpenFlatfileOstream(const string & name)487 CNcbiOstream* CAsn2FlatApp::OpenFlatfileOstream(const string& name)
488 {
489 const CArgs& args = GetArgs();
490
491 if (! args[name])
492 return nullptr;
493
494 CNcbiOstream* flatfile_os = &(args[name].AsOutputFile());
495
496 // so we don't fail silently if, e.g., the output disk gets full
497 flatfile_os->exceptions( ios::failbit | ios::badbit );
498
499 return flatfile_os;
500 }
501
s_INSDSetOpen(bool is_insdseq,CNcbiOstream * os)502 static void s_INSDSetOpen(bool is_insdseq, CNcbiOstream* os) {
503
504 if (is_insdseq) {
505 *os << "<INSDSet>" << endl;
506 }
507 }
508
s_INSDSetClose(bool is_insdseq,CNcbiOstream * os)509 static void s_INSDSetClose(bool is_insdseq, CNcbiOstream* os) {
510
511 if (is_insdseq) {
512 *os << "</INSDSet>" << endl;
513 }
514 }
515
516
Run()517 int CAsn2FlatApp::Run()
518 {
519 CTime expires = GetFullVersion().GetBuildInfo().GetBuildTime();
520 if (!expires.IsEmpty())
521 {
522 expires.AddYear();
523 if (CTime(CTime::eCurrent) > expires)
524 {
525 NcbiCerr << "This copy of " << GetProgramDisplayName()
526 << " is more than 1 year old. Please download the current version if it is newer." << endl;
527 }
528 }
529
530 m_Exception = false;
531 m_FetchFail = false;
532
533 // initialize conn library
534 CONNECT_Init(&GetConfig());
535
536 const CArgs& args = GetArgs();
537
538 CSignal::TrapSignals(CSignal::eSignal_USR1);
539
540 // create object manager
541 m_Objmgr = CObjectManager::GetInstance();
542 if ( !m_Objmgr ) {
543 NCBI_THROW(CException, eUnknown, "Could not create object manager");
544 }
545 if (args["gbload"] || args["id"] || args["ids"]) {
546 static const CDataLoadersUtil::TLoaders default_loaders =
547 CDataLoadersUtil::fGenbank | CDataLoadersUtil::fVDB | CDataLoadersUtil::fSRA;
548 CDataLoadersUtil::SetupObjectManager(args, *m_Objmgr, default_loaders);
549
550 if (( args["enable-external"] && ! args["no-external"] ) || args["policy"].AsString() == "external" ) {
551 CGBDataLoader* gb_loader = dynamic_cast<CGBDataLoader*>(CObjectManager::GetInstance()->FindDataLoader("GBLOADER"));
552 if (gb_loader) {
553 // needed to find remote features when reading local ASN.1 file
554 gb_loader->CGBDataLoader::SetAlwaysLoadExternal(true);
555 }
556 }
557 }
558
559 m_Scope.Reset(new CScope(*m_Objmgr));
560 m_Scope->AddDefaults();
561
562 const CNcbiRegistry& cfg = CNcbiApplication::Instance()->GetConfig();
563 m_PSGMode = cfg.GetBool("genbank", "loader_psg", false, 0, CNcbiRegistry::eReturn);
564 if (m_PSGMode) {
565 #ifdef USE_SNPLOADER
566 string host = cfg.GetString("SNPAccess", "host", "");
567 string port = cfg.GetString("SNPAccess", "port", "");
568 string hostport = host + ":" + port;
569
570 auto channel =
571 grpc::CreateChannel(hostport, grpc::InsecureChannelCredentials());
572 m_SNPTrackStub =
573 ncbi::grpcapi::dbsnp::primary_track::DbSnpPrimaryTrack::NewStub(channel);
574 CRef<CSNPDataLoader> snp_data_loader(CSNPDataLoader::RegisterInObjectManager(*m_Objmgr).GetLoader());
575 m_Scope->AddDataLoader(snp_data_loader->GetLoaderNameFromArgs());
576 #endif
577 #ifdef USE_CDDLOADER
578 bool use_mongo_cdd =
579 cfg.GetBool("genbank", "vdb_cdd", false, 0, CNcbiRegistry::eReturn) &&
580 cfg.GetBool("genbank", "always_load_external", false, 0, CNcbiRegistry::eReturn);
581 if (use_mongo_cdd) {
582 CRef<CCDDDataLoader> cdd_data_loader(CCDDDataLoader::RegisterInObjectManager(*m_Objmgr).GetLoader());
583 m_Scope->AddDataLoader(cdd_data_loader->GetLoaderNameFromArgs());
584 }
585 #endif
586 }
587
588 // open the output streams
589 m_Os = OpenFlatfileOstream ("o");
590 m_On = OpenFlatfileOstream ("on");
591 m_Og = OpenFlatfileOstream ("og");
592 m_Or = OpenFlatfileOstream ("or");
593 m_Op = OpenFlatfileOstream ("op");
594 m_Ou = OpenFlatfileOstream ("ou");
595
596 m_OnlyNucs = false;
597 m_OnlyProts = false;
598 if (m_Os) {
599 const string& view = args["view"].AsString();
600 m_OnlyNucs = (view == "nuc");
601 m_OnlyProts = (view == "prot");
602 }
603
604 if (!m_Os && !m_On && !m_Og && !m_Or && !m_Op && !m_Ou) {
605 // No output (-o*) argument given - default to stdout
606 m_Os = &cout;
607 }
608
609 bool is_insdseq = false;
610 if (args["format"].AsString() == "insdseq") {
611 // only print <INSDSet> ... </INSDSet> wrappers if single output stream
612 if (m_Os) {
613 is_insdseq = true;
614 }
615 }
616
617 // create the flat-file generator
618 m_FFGenerator.Reset(x_CreateFlatFileGenerator(args));
619 if ( args["no-external"] || args["policy"].AsString() == "internal" ) {
620 m_FFGenerator->SetAnnotSelector().SetExcludeExternal(true);
621 }
622 // else if (!m_Scope->GetKeepExternalAnnotsForEdit()) {
623 // m_Scope->SetKeepExternalAnnotsForEdit();
624 // }
625 if( args["resolve-all"] || args["policy"].AsString() == "external") {
626 m_FFGenerator->SetAnnotSelector().SetResolveAll();
627 }
628 if( args["depth"] ) {
629 m_FFGenerator->SetAnnotSelector().SetResolveDepth(args["depth"].AsInteger());
630 }
631 if( args["max_search_segments"] ) {
632 m_FFGenerator->SetAnnotSelector().SetMaxSearchSegments(args["max_search_segments"].AsInteger());
633 }
634 if( args["max_search_time"] ) {
635 m_FFGenerator->SetAnnotSelector().SetMaxSearchTime(float(args["max_search_time"].AsDouble()));
636 }
637
638 unique_ptr<CObjectIStream> is;
639 is.reset( x_OpenIStream( args ) );
640 if (!is) {
641 string msg = args["i"]? "Unable to open input file" + args["i"].AsString() :
642 "Unable to read data from stdin";
643 NCBI_THROW(CException, eUnknown, msg);
644 }
645
646 if ( args[ "sub" ] ) {
647 s_INSDSetOpen ( is_insdseq, m_Os );
648 HandleSeqSubmit( *is );
649 s_INSDSetClose ( is_insdseq, m_Os );
650 if (m_Exception) return -1;
651 return 0;
652 }
653
654 if ( args[ "batch" ] ) {
655 s_INSDSetOpen ( is_insdseq, m_Os );
656 bool propagate = args[ "p" ];
657 CGBReleaseFile in( *is.release(), propagate );
658 in.RegisterHandler( this );
659 in.Read(); // HandleSeqEntry will be called from this function
660 s_INSDSetClose ( is_insdseq, m_Os );
661 if (m_Exception) return -1;
662 return 0;
663 }
664
665 if ( args[ "ids" ] ) {
666 s_INSDSetOpen ( is_insdseq, m_Os );
667 CNcbiIstream& istr = args["ids"].AsInputFile();
668 string id_str;
669 while (NcbiGetlineEOL(istr, id_str)) {
670 id_str = NStr::TruncateSpaces(id_str);
671 if (id_str.empty() || id_str[0] == '#') {
672 continue;
673 }
674
675 try {
676 LOG_POST(Error << "id = " << id_str);
677 HandleSeqId( id_str );
678 }
679 catch (CException& e) {
680 ERR_POST(Error << e);
681 }
682 }
683 s_INSDSetClose ( is_insdseq, m_Os );
684 if (m_Exception) return -1;
685 return 0;
686 }
687
688 if ( args[ "id" ] ) {
689 s_INSDSetOpen ( is_insdseq, m_Os );
690 HandleSeqId( args[ "id" ].AsString() );
691 s_INSDSetClose ( is_insdseq, m_Os );
692 if (m_Exception) return -1;
693 return 0;
694 }
695
696 string asn_type = args["type"].AsString();
697
698 s_INSDSetOpen ( is_insdseq, m_Os );
699 if ( asn_type == "seq-entry" ) {
700 //
701 // Straight through processing: Read a seq_entry, then process
702 // a seq_entry:
703 //
704 while ( !is->EndOfData() ) {
705 CSeq_entry_Handle seh = ObtainSeqEntryFromSeqEntry(*is, true);
706 if ( !seh ) {
707 NCBI_THROW(CException, eUnknown,
708 "Unable to construct Seq-entry object" );
709 }
710 HandleSeqEntry(seh);
711 m_Scope->RemoveTopLevelSeqEntry(seh);
712 }
713 }
714 else if ( asn_type == "bioseq" ) {
715 //
716 // Read object as a bioseq, wrap it into a seq_entry, then process
717 // the wrapped bioseq as a seq_entry:
718 //
719 while ( !is->EndOfData() ) {
720 CSeq_entry_Handle seh = ObtainSeqEntryFromBioseq(*is, true);
721 if ( !seh ) {
722 NCBI_THROW(CException, eUnknown,
723 "Unable to construct Seq-entry object" );
724 }
725 HandleSeqEntry(seh);
726 m_Scope->RemoveTopLevelSeqEntry(seh);
727 }
728 }
729 else if ( asn_type == "bioseq-set" ) {
730 //
731 // Read object as a bioseq_set, wrap it into a seq_entry, then
732 // process the wrapped bioseq_set as a seq_entry:
733 //
734 while ( !is->EndOfData() ) {
735 CSeq_entry_Handle seh = ObtainSeqEntryFromBioseqSet(*is, true);
736 if ( !seh ) {
737 NCBI_THROW(CException, eUnknown,
738 "Unable to construct Seq-entry object" );
739 }
740 HandleSeqEntry(seh);
741 m_Scope->RemoveTopLevelSeqEntry(seh);
742 }
743 }
744 else if ( asn_type == "seq-submit" ) {
745 while ( !is->EndOfData() ) {
746 HandleSeqSubmit( *is );
747 }
748 }
749 else if ( asn_type == "any" ) {
750 //
751 // Try the first four in turn:
752 //
753 while ( !is->EndOfData() ) {
754 string strNextTypeName = is->PeekNextTypeName();
755
756 CSeq_entry_Handle seh = ObtainSeqEntryFromSeqEntry(*is);
757 if ( !seh ) {
758 is->Close();
759 is.reset( x_OpenIStream( args ) );
760 seh = ObtainSeqEntryFromBioseqSet(*is);
761 if ( !seh ) {
762 is->Close();
763 is.reset( x_OpenIStream( args ) );
764 seh = ObtainSeqEntryFromBioseq(*is);
765 if ( !seh ) {
766 is->Close();
767 is.reset( x_OpenIStream( args ) );
768 CRef<CSeq_submit> sub(new CSeq_submit);
769 *is >> *sub;
770 if (sub->IsSetSub() && sub->IsSetData()) {
771 HandleSeqSubmit(*sub);
772 if (m_Exception) return -1;
773 return 0;
774 } else {
775 NCBI_THROW(
776 CException, eUnknown,
777 "Unable to construct Seq-entry object"
778 );
779 }
780 }
781 }
782 }
783 HandleSeqEntry(seh);
784 m_Scope->RemoveTopLevelSeqEntry(seh);
785 }
786 }
787 s_INSDSetClose ( is_insdseq, m_Os );
788
789 if (m_Exception) return -1;
790 return 0;
791 }
792
793
HandleSeqSubmit(CSeq_submit & sub)794 void CAsn2FlatApp::HandleSeqSubmit(CSeq_submit& sub)
795 {
796 if (!sub.IsSetSub() || !sub.IsSetData() || !sub.GetData().IsEntrys()) {
797 return;
798 }
799 CRef<CScope> scope(new CScope(*m_Objmgr));
800 scope->AddDefaults();
801 if (m_do_cleanup) {
802 CCleanup cleanup;
803 cleanup.BasicCleanup(sub);
804 }
805 const CArgs& args = GetArgs();
806 if (args["from"] || args["to"] || args["strand"]) {
807 CRef<CSeq_entry> e(sub.SetData().SetEntrys().front());
808 CSeq_entry_Handle seh;
809 try {
810 seh = scope->GetSeq_entryHandle(*e);
811 } catch (CException&) {}
812
813 if (!seh) { // add to scope if not already in it
814 seh = scope->AddTopLevelSeqEntry(*e);
815 }
816 // "remember" the submission block
817 m_FFGenerator->SetSubmit(sub.GetSub());
818 CSeq_loc loc;
819 x_GetLocation(seh, args, loc);
820 try {
821 m_FFGenerator->Generate(loc, seh.GetScope(), *m_Os);
822 }
823 catch (CException& e) {
824 ERR_POST(Error << e);
825 m_Exception = true;
826 }
827 } else {
828 try {
829 m_FFGenerator->Generate(sub, *scope, *m_Os);
830 }
831 catch (CException& e) {
832 ERR_POST(Error << e);
833 m_Exception = true;
834 }
835 }
836 }
837
838
839 // ============================================================================
HandleSeqSubmit(CObjectIStream & is)840 void CAsn2FlatApp::HandleSeqSubmit(CObjectIStream& is )
841 // ============================================================================
842 {
843 CRef<CSeq_submit> sub(new CSeq_submit);
844 is >> *sub;
845 HandleSeqSubmit(*sub);
846 }
847
848 // ============================================================================
HandleSeqId(const string & strId)849 void CAsn2FlatApp::HandleSeqId(
850 const string& strId )
851 // ============================================================================
852 {
853 CSeq_entry_Handle seh;
854
855 // This C++-scope gets a raw CSeq_entry that has no attachment
856 // to any CScope and puts it into entry.
857 {
858 CSeq_id id(strId);
859 CBioseq_Handle bsh = m_Scope->GetBioseqHandle( id );
860 if ( ! bsh ) {
861 NCBI_THROW(
862 CException, eUnknown,
863 "Unable to retrieve data for the given ID"
864 );
865 }
866 seh = bsh.GetParentEntry();
867 /*
868 CConstRef<CSeq_entry> ser = seh.GetTopLevelEntry().GetCompleteSeq_entry();
869 if (ser) {
870 cout << MSerial_AsnText << *ser << endl;
871 }
872 */
873 }
874
875 //
876 // ... and use that to generate the flat file:
877 //
878 HandleSeqEntry( seh );
879 }
880
881 // ============================================================================
HandleSeqEntry(const CSeq_entry_Handle & seh)882 bool CAsn2FlatApp::HandleSeqEntry(const CSeq_entry_Handle& seh )
883 // ============================================================================
884 {
885 const CArgs& args = GetArgs();
886
887 if ( m_do_cleanup ) {
888 CSeq_entry_EditHandle tseh = seh.GetTopLevelEntry().GetEditHandle();
889 CBioseq_set_EditHandle bseth;
890 CBioseq_EditHandle bseqh;
891 CRef<CSeq_entry> tmp_se(new CSeq_entry);
892 if ( tseh.IsSet() ) {
893 bseth = tseh.SetSet();
894 CConstRef<CBioseq_set> bset = bseth.GetCompleteObject();
895 bseth.Remove(bseth.eKeepSeq_entry);
896 tmp_se->SetSet(const_cast<CBioseq_set&>(*bset));
897 }
898 else {
899 bseqh = tseh.SetSeq();
900 CConstRef<CBioseq> bseq = bseqh.GetCompleteObject();
901 bseqh.Remove(bseqh.eKeepSeq_entry);
902 tmp_se->SetSeq(const_cast<CBioseq&>(*bseq));
903 }
904
905 CCleanup cleanup;
906 cleanup.BasicCleanup( *tmp_se );
907
908 if ( tmp_se->IsSet() ) {
909 tseh.SelectSet(bseth);
910 }
911 else {
912 tseh.SelectSeq(bseqh);
913 }
914 }
915
916 if ( args["faster"] || args["policy"].AsString() == "ftp" || args["policy"].AsString() == "web" ) {
917
918 try {
919 if ( args["from"] || args["to"] || args["strand"] ) {
920 CSeq_loc loc;
921 x_GetLocation( seh, args, loc );
922 CNcbiOstream* flatfile_os = m_Os;
923 m_FFGenerator->Generate(loc, seh.GetScope(), *flatfile_os, true, m_Os);
924 } else {
925 CNcbiOstream* flatfile_os = m_Os;
926 m_FFGenerator->Generate( seh, *flatfile_os, true, m_Os, m_On, m_Og, m_Or, m_Op, m_Ou);
927 }
928 }
929 catch (CException& e) {
930 ERR_POST(Error << e);
931 m_Exception = true;
932 }
933
934 return true;
935 }
936
937 m_FFGenerator->SetFeatTree(new feature::CFeatTree(seh));
938
939 for (CBioseq_CI bioseq_it(seh); bioseq_it; ++bioseq_it) {
940 CBioseq_Handle bsh = *bioseq_it;
941 CConstRef<CBioseq> bsr = bsh.GetCompleteBioseq();
942
943 CNcbiOstream* flatfile_os = nullptr;
944
945 bool is_genomic = false;
946 bool is_RNA = false;
947
948 CConstRef<CSeqdesc> closest_molinfo
949 = bsr->GetClosestDescriptor(CSeqdesc::e_Molinfo);
950 if (closest_molinfo) {
951 const CMolInfo& molinf = closest_molinfo->GetMolinfo();
952 CMolInfo::TBiomol biomol = molinf.GetBiomol();
953 switch (biomol) {
954 case NCBI_BIOMOL(genomic):
955 case NCBI_BIOMOL(other_genetic):
956 case NCBI_BIOMOL(genomic_mRNA):
957 case NCBI_BIOMOL(cRNA):
958 is_genomic = true;
959 break;
960 case NCBI_BIOMOL(pre_RNA):
961 case NCBI_BIOMOL(mRNA):
962 case NCBI_BIOMOL(rRNA):
963 case NCBI_BIOMOL(tRNA):
964 case NCBI_BIOMOL(snRNA):
965 case NCBI_BIOMOL(scRNA):
966 case NCBI_BIOMOL(snoRNA):
967 case NCBI_BIOMOL(transcribed_RNA):
968 case NCBI_BIOMOL(ncRNA):
969 case NCBI_BIOMOL(tmRNA):
970 is_RNA = true;
971 break;
972 case NCBI_BIOMOL(other):
973 {
974 CBioseq_Handle::TMol mol = bsh.GetSequenceType();
975 switch (mol) {
976 case CSeq_inst::eMol_dna:
977 is_genomic = true;
978 break;
979 case CSeq_inst::eMol_rna:
980 is_RNA = true;
981 break;
982 case CSeq_inst::eMol_na:
983 is_genomic = true;
984 break;
985 default:
986 break;
987 }
988 }
989 break;
990 default:
991 break;
992 }
993 }
994
995 if ( m_Os ) {
996 if ( m_OnlyNucs && ! bsh.IsNa() ) continue;
997 if ( m_OnlyProts && ! bsh.IsAa() ) continue;
998 flatfile_os = m_Os;
999 } else if ( bsh.IsNa() ) {
1000 if ( m_On ) {
1001 flatfile_os = m_On;
1002 } else if ( (is_genomic || ! closest_molinfo) && m_Og ) {
1003 flatfile_os = m_Og;
1004 } else if ( is_RNA && m_Or ) {
1005 flatfile_os = m_Or;
1006 } else {
1007 continue;
1008 }
1009 } else if ( bsh.IsAa() ) {
1010 if ( m_Op ) {
1011 flatfile_os = m_Op;
1012 }
1013 } else {
1014 if ( m_Ou ) {
1015 flatfile_os = m_Ou;
1016 } else if ( m_On ) {
1017 flatfile_os = m_On;
1018 } else {
1019 continue;
1020 }
1021 }
1022
1023 if ( !flatfile_os ) continue;
1024
1025 if (m_PSGMode && ( args["enable-external"] || args["policy"].AsString() == "external" ))
1026 x_AddSNPAnnots(bsh);
1027
1028 // generate flat file
1029 if ( args["from"] || args["to"] || args["strand"] ) {
1030 CSeq_loc loc;
1031 x_GetLocation( seh, args, loc );
1032 try {
1033 m_FFGenerator->Generate(loc, seh.GetScope(), *flatfile_os);
1034 }
1035 catch (CException& e) {
1036 ERR_POST(Error << e);
1037 m_Exception = true;
1038 }
1039 // emulate the C Toolkit: only produce flatfile for first sequence
1040 // when range is specified
1041 return true;
1042 }
1043 else {
1044 int count = args["count"].AsInteger();
1045 for ( int i = 0; i < count; ++i ) {
1046 try {
1047 m_FFGenerator->Generate( bsh, *flatfile_os);
1048 }
1049 catch (CException& e) {
1050 ERR_POST(Error << e);
1051 m_Exception = true;
1052 }
1053 }
1054
1055 }
1056 }
1057 return true;
1058 }
1059
ObtainSeqEntryFromSeqEntry(CObjectIStream & is,bool report)1060 CSeq_entry_Handle CAsn2FlatApp::ObtainSeqEntryFromSeqEntry(CObjectIStream& is, bool report)
1061 {
1062 try {
1063 CRef<CSeq_entry> se(new CSeq_entry);
1064 is >> *se;
1065 if (se->Which() == CSeq_entry::e_not_set) {
1066 NCBI_THROW(CException, eUnknown,
1067 "provided Seq-entry is empty");
1068 }
1069 return m_Scope->AddTopLevelSeqEntry(*se);
1070 }
1071 catch (CException& e) {
1072 if (report) {
1073 ERR_POST(Error << e);
1074 }
1075 }
1076 return CSeq_entry_Handle();
1077 }
1078
ObtainSeqEntryFromBioseq(CObjectIStream & is,bool report)1079 CSeq_entry_Handle CAsn2FlatApp::ObtainSeqEntryFromBioseq(CObjectIStream& is, bool report)
1080 {
1081 try {
1082 CRef<CBioseq> bs(new CBioseq);
1083 is >> *bs;
1084 CBioseq_Handle bsh = m_Scope->AddBioseq(*bs);
1085 return bsh.GetTopLevelEntry();
1086 }
1087 catch (CException& e) {
1088 if (report) {
1089 ERR_POST(Error << e);
1090 }
1091 }
1092 return CSeq_entry_Handle();
1093 }
1094
ObtainSeqEntryFromBioseqSet(CObjectIStream & is,bool report)1095 CSeq_entry_Handle CAsn2FlatApp::ObtainSeqEntryFromBioseqSet(CObjectIStream& is, bool report)
1096 {
1097 try {
1098 CRef<CSeq_entry> entry(new CSeq_entry);
1099 is >> entry->SetSet();
1100 return m_Scope->AddTopLevelSeqEntry(*entry);
1101 }
1102 catch (CException& e) {
1103 if (report) {
1104 ERR_POST(Error << e);
1105 }
1106 }
1107 return CSeq_entry_Handle();
1108 }
1109
1110
HandleSeqEntry(CRef<CSeq_entry> & se)1111 bool CAsn2FlatApp::HandleSeqEntry(CRef<CSeq_entry>& se)
1112 {
1113 if (!se) {
1114 return false;
1115 }
1116
1117 // add entry to scope
1118 CSeq_entry_Handle entry = m_Scope->AddTopLevelSeqEntry(*se);
1119 if ( !entry ) {
1120 NCBI_THROW(CException, eUnknown, "Failed to insert entry to scope.");
1121 }
1122
1123 bool ret = HandleSeqEntry(entry);
1124 // Needed because we can really accumulate a lot of junk otherwise,
1125 // and we end up with significant slowdown due to repeatedly doing
1126 // linear scans on a growing CScope.
1127 m_Scope->ResetDataAndHistory();
1128 return ret;
1129 }
1130
x_OpenIStream(const CArgs & args)1131 CObjectIStream* CAsn2FlatApp::x_OpenIStream(const CArgs& args)
1132 {
1133 // determine the file serialization format.
1134 // default for batch files is binary, otherwise text.
1135 ESerialDataFormat serial = args["batch"] ? eSerial_AsnBinary : eSerial_AsnText;
1136 if ( args["serial"] ) {
1137 const string& val = args["serial"].AsString();
1138 if ( val == "text" ) {
1139 serial = eSerial_AsnText;
1140 } else if ( val == "binary" ) {
1141 serial = eSerial_AsnBinary;
1142 } else if ( val == "XML" ) {
1143 serial = eSerial_Xml;
1144 }
1145 }
1146
1147 // make sure of the underlying input stream. If -i was given on the command line
1148 // then the input comes from a file. Otherwise, it comes from stdin:
1149 CNcbiIstream* pInputStream = &NcbiCin;
1150 bool bDeleteOnClose = false;
1151 if ( args["i"] ) {
1152 pInputStream = new CNcbiIfstream( args["i"].AsString(), ios::binary );
1153 bDeleteOnClose = true;
1154 }
1155
1156 // if -c was specified then wrap the input stream into a gzip decompressor before
1157 // turning it into an object stream:
1158 CObjectIStream* pI = nullptr;
1159 if ( args["c"] ) {
1160 CZipStreamDecompressor* pDecompressor = new CZipStreamDecompressor(
1161 512, 512, kZlibDefaultWbits, CZipCompression::fCheckFileHeader );
1162 CCompressionIStream* pUnzipStream = new CCompressionIStream(
1163 *pInputStream, pDecompressor, CCompressionIStream::fOwnProcessor );
1164 pI = CObjectIStream::Open( serial, *pUnzipStream, eTakeOwnership );
1165 }
1166 else {
1167 pI = CObjectIStream::Open(
1168 serial, *pInputStream, (bDeleteOnClose ? eTakeOwnership : eNoOwnership));
1169 }
1170
1171 if ( pI ) {
1172 pI->UseMemoryPool();
1173 }
1174 return pI;
1175 }
1176
1177
x_CreateFlatFileGenerator(const CArgs & args)1178 CFlatFileGenerator* CAsn2FlatApp::x_CreateFlatFileGenerator(const CArgs& args)
1179 {
1180 CFlatFileConfig cfg;
1181 cfg.FromArguments(args);
1182 m_do_cleanup = ( ! args["nocleanup"]);
1183 cfg.BasicCleanup(false);
1184
1185 if (args["html"])
1186 {
1187 CRef<IHTMLFormatter> html_fmt(new CHTMLFormatterEx(m_Scope));
1188 cfg.SetHTMLFormatter(html_fmt);
1189 }
1190
1191 CRef<TGenbankBlockCallback> genbank_callback( x_GetGenbankCallback(args) );
1192
1193 if( args["benchmark-cancel-checking"] ) {
1194 x_CreateCancelBenchmarkCallback();
1195 }
1196
1197 //CFlatFileConfig cfg(
1198 // format, mode, style, flags, view, gff_options, genbank_blocks,
1199 // genbank_callback.GetPointerOrNull(), m_pCanceledCallback.get(),
1200 // args["cleanup"] );
1201 CFlatFileGenerator* generator = new CFlatFileGenerator(cfg);
1202
1203 // ID-5865 : SNP annotations must be explicitly added to the annot selector
1204 if (!m_PSGMode && cfg.ShowSNPFeatures()) {
1205 cfg.SetHideSNPFeatures(false);
1206 generator->SetAnnotSelector().IncludeNamedAnnotAccession("SNP");
1207 }
1208
1209 return generator;
1210 }
1211
1212 CAsn2FlatApp::TGenbankBlockCallback*
x_GetGenbankCallback(const CArgs & args)1213 CAsn2FlatApp::x_GetGenbankCallback(const CArgs& args)
1214 {
1215 class CSimpleCallback : public TGenbankBlockCallback {
1216 public:
1217 CSimpleCallback() { }
1218 ~CSimpleCallback() override
1219 {
1220 cerr << endl;
1221 cerr << "GENBANK CALLBACK DEMO STATISTICS" << endl;
1222 cerr << endl;
1223 cerr << "Appearances of each type: " << endl;
1224 cerr << endl;
1225 x_DumpTypeToCountMap(m_TypeAppearancesMap);
1226 cerr << endl;
1227 cerr << "Total characters output for each type: " << endl;
1228 cerr << endl;
1229 x_DumpTypeToCountMap(m_TypeCharsMap);
1230 cerr << endl;
1231 cerr << "Average characters output for each type: " << endl;
1232 cerr << endl;
1233 x_PrintAverageStats();
1234 }
1235
1236 EBioseqSkip notify_bioseq( CBioseqContext & ctx ) override
1237 {
1238 cerr << "notify_bioseq called." << endl;
1239 return eBioseqSkip_No;
1240 }
1241
1242 // this macro is the lesser evil compared to the messiness that
1243 // you would see here otherwise. (plus it's less error-prone)
1244 #define SIMPLE_CALLBACK_NOTIFY(TItemClass) \
1245 EAction notify( string & block_text, \
1246 const CBioseqContext& ctx, \
1247 const TItemClass & item ) override { \
1248 NStr::ReplaceInPlace(block_text, "MODIFY TEST", "WAS MODIFIED TEST" ); \
1249 cerr << #TItemClass << " {" << block_text << '}' << endl; \
1250 ++m_TypeAppearancesMap[#TItemClass]; \
1251 ++m_TypeAppearancesMap["TOTAL"]; \
1252 m_TypeCharsMap[#TItemClass] += block_text.length(); \
1253 m_TypeCharsMap["TOTAL"] += block_text.length(); \
1254 EAction eAction = eAction_Default; \
1255 if( block_text.find("SKIP TEST") != string::npos ) { \
1256 eAction = eAction_Skip; \
1257 } \
1258 if ( block_text.find("HALT TEST") != string::npos ) { \
1259 eAction = eAction_HaltFlatfileGeneration; \
1260 } \
1261 return eAction; }
1262
1263 SIMPLE_CALLBACK_NOTIFY(CStartSectionItem);
1264 SIMPLE_CALLBACK_NOTIFY(CHtmlAnchorItem);
1265 SIMPLE_CALLBACK_NOTIFY(CLocusItem);
1266 SIMPLE_CALLBACK_NOTIFY(CDeflineItem);
1267 SIMPLE_CALLBACK_NOTIFY(CAccessionItem);
1268 SIMPLE_CALLBACK_NOTIFY(CVersionItem);
1269 SIMPLE_CALLBACK_NOTIFY(CGenomeProjectItem);
1270 SIMPLE_CALLBACK_NOTIFY(CDBSourceItem);
1271 SIMPLE_CALLBACK_NOTIFY(CKeywordsItem);
1272 SIMPLE_CALLBACK_NOTIFY(CSegmentItem);
1273 SIMPLE_CALLBACK_NOTIFY(CSourceItem);
1274 SIMPLE_CALLBACK_NOTIFY(CReferenceItem);
1275 SIMPLE_CALLBACK_NOTIFY(CCommentItem);
1276 SIMPLE_CALLBACK_NOTIFY(CPrimaryItem);
1277 SIMPLE_CALLBACK_NOTIFY(CFeatHeaderItem);
1278 SIMPLE_CALLBACK_NOTIFY(CSourceFeatureItem);
1279 SIMPLE_CALLBACK_NOTIFY(CFeatureItem);
1280 SIMPLE_CALLBACK_NOTIFY(CGapItem);
1281 SIMPLE_CALLBACK_NOTIFY(CBaseCountItem);
1282 SIMPLE_CALLBACK_NOTIFY(COriginItem);
1283 SIMPLE_CALLBACK_NOTIFY(CSequenceItem);
1284 SIMPLE_CALLBACK_NOTIFY(CContigItem);
1285 SIMPLE_CALLBACK_NOTIFY(CWGSItem);
1286 SIMPLE_CALLBACK_NOTIFY(CTSAItem);
1287 SIMPLE_CALLBACK_NOTIFY(CEndSectionItem);
1288 #undef SIMPLE_CALLBACK_NOTIFY
1289
1290 private:
1291 typedef map<string, size_t> TTypeToCountMap;
1292 // for each type, how many instances of that type did we see?
1293 // We use the special string "TOTAL" for a total count.
1294 TTypeToCountMap m_TypeAppearancesMap;
1295 // Like m_TypeAppearancesMap but counts total number of *characters*
1296 // instead of number of items. Again, there is
1297 // the special value "TOTAL"
1298 TTypeToCountMap m_TypeCharsMap;
1299
1300 void x_DumpTypeToCountMap(const TTypeToCountMap & the_map ) {
1301 ITERATE( TTypeToCountMap, map_iter, the_map ) {
1302 cerr << setw(25) << left << (map_iter->first + ':')
1303 << " " << map_iter->second << endl;
1304 }
1305 }
1306
1307 void x_PrintAverageStats() {
1308 ITERATE( TTypeToCountMap, map_iter, m_TypeAppearancesMap ) {
1309 const string sType = map_iter->first;
1310 const size_t iAppearances = map_iter->second;
1311 const size_t iTotalCharacters = m_TypeCharsMap[sType];
1312 cerr << setw(25) << left << (sType + ':')
1313 << " " << (iTotalCharacters / iAppearances) << endl;
1314 }
1315 }
1316 };
1317
1318 if( args["demo-genbank-callback"] ) {
1319 return new CSimpleCallback;
1320 } else {
1321 return nullptr;
1322 }
1323 }
1324
x_GetFrom(const CArgs & args)1325 TSeqPos CAsn2FlatApp::x_GetFrom(const CArgs& args)
1326 {
1327 return args["from"] ?
1328 static_cast<TSeqPos>(args["from"].AsInteger() - 1) :
1329 CRange<TSeqPos>::GetWholeFrom();
1330 }
1331
1332
x_GetTo(const CArgs & args)1333 TSeqPos CAsn2FlatApp::x_GetTo(const CArgs& args)
1334 {
1335 return args["to"] ?
1336 static_cast<TSeqPos>(args["to"].AsInteger() - 1) :
1337 CRange<TSeqPos>::GetWholeTo();
1338 }
1339
x_GetStrand(const CArgs & args)1340 ENa_strand CAsn2FlatApp::x_GetStrand(const CArgs& args)
1341 {
1342 return static_cast<ENa_strand>(args["strand"].AsInteger());
1343 }
1344
1345
x_GetLocation(const CSeq_entry_Handle & entry,const CArgs & args,CSeq_loc & loc)1346 void CAsn2FlatApp::x_GetLocation
1347 (const CSeq_entry_Handle& entry,
1348 const CArgs& args,
1349 CSeq_loc& loc)
1350 {
1351 _ASSERT(entry);
1352
1353 CBioseq_Handle h = x_DeduceTarget(entry);
1354 if ( !h ) {
1355 NCBI_THROW(CFlatException, eInvalidParam,
1356 "Cannot deduce target bioseq.");
1357 }
1358 TSeqPos length = h.GetInst_Length();
1359 TSeqPos from = x_GetFrom(args);
1360 TSeqPos to = min(x_GetTo(args), length-1);
1361 ENa_strand strand = eNa_strand_unknown;
1362 if ( args["strand"] ) {
1363 strand = x_GetStrand(args);
1364 }
1365
1366 if ( from == CRange<TSeqPos>::GetWholeFrom() && to == length ) {
1367 // whole
1368 loc.SetWhole().Assign(*h.GetSeqId());
1369 } else {
1370 // interval
1371 loc.SetInt().SetId().Assign(*h.GetSeqId());
1372 loc.SetInt().SetFrom(from);
1373 loc.SetInt().SetTo(to);
1374 if ( strand > 0 ) {
1375 loc.SetInt().SetStrand(strand);
1376 }
1377 }
1378 }
1379
1380
1381 // if the 'from' or 'to' flags are specified try to guess the bioseq.
x_DeduceTarget(const CSeq_entry_Handle & entry)1382 CBioseq_Handle CAsn2FlatApp::x_DeduceTarget(const CSeq_entry_Handle& entry)
1383 {
1384 if ( entry.IsSeq() ) {
1385 return entry.GetSeq();
1386 }
1387
1388 _ASSERT(entry.IsSet());
1389 CBioseq_set_Handle bsst = entry.GetSet();
1390 if ( !bsst.IsSetClass() ) {
1391 NCBI_THROW(CFlatException, eInvalidParam,
1392 "Cannot deduce target bioseq.");
1393 }
1394 _ASSERT(bsst.IsSetClass());
1395 switch ( bsst.GetClass() ) {
1396 case CBioseq_set::eClass_nuc_prot:
1397 // return the nucleotide
1398 for ( CSeq_entry_CI it(entry); it; ++it ) {
1399 if ( it->IsSeq() ) {
1400 CBioseq_Handle h = it->GetSeq();
1401 if ( h && CSeq_inst::IsNa(h.GetInst_Mol()) ) {
1402 return h;
1403 }
1404 }
1405 }
1406 break;
1407 case CBioseq_set::eClass_gen_prod_set:
1408 // return the genomic
1409 for ( CSeq_entry_CI it(bsst); it; ++it ) {
1410 if ( it->IsSeq() &&
1411 it->GetSeq().GetInst_Mol() == CSeq_inst::eMol_dna ) {
1412 return it->GetSeq();
1413 }
1414 }
1415 break;
1416 case CBioseq_set::eClass_segset:
1417 // return the segmented bioseq
1418 for ( CSeq_entry_CI it(bsst); it; ++it ) {
1419 if ( it->IsSeq() ) {
1420 return it->GetSeq();
1421 }
1422 }
1423 break;
1424 case CBioseq_set::eClass_genbank:
1425 {
1426 CBioseq_CI bi(bsst, CSeq_inst::eMol_na);
1427 if (bi) {
1428 return *bi;
1429 }
1430 }
1431 break;
1432 default:
1433 break;
1434 }
1435 NCBI_THROW(CFlatException, eInvalidParam,
1436 "Cannot deduce target bioseq.");
1437 }
1438
1439 void
x_CreateCancelBenchmarkCallback()1440 CAsn2FlatApp::x_CreateCancelBenchmarkCallback()
1441 {
1442 // This ICanceled interface always says "keep going"
1443 // unless SIGUSR1 is received,
1444 // and it keeps statistics on how often it is checked
1445 class CCancelBenchmarking : public ICanceled {
1446 public:
1447 CCancelBenchmarking()
1448 : m_TimeOfLastCheck(CTime::eCurrent)
1449 {
1450 }
1451
1452
1453 ~CCancelBenchmarking()
1454 {
1455 // On destruction, we call "IsCanceled" one more time to make
1456 // sure there wasn't a point where we stopped
1457 // checking IsCanceled.
1458 IsCanceled();
1459
1460 // print statistics
1461 cerr << "##### Cancel-checking benchmarks:" << endl;
1462 cerr << endl;
1463
1464 // maybe cancelation was never checked:
1465 if( m_GapSizeMap.empty() ) {
1466 cerr << "NO DATA" << endl;
1467 return;
1468 }
1469
1470 cerr << "(all times in milliseconds)" << endl;
1471 cerr << endl;
1472 // easy stats
1473 cerr << "Min: " << m_GapSizeMap.begin()->first << endl;
1474 cerr << "Max: " << m_GapSizeMap.rbegin()->first << endl;
1475
1476 // find average
1477 Int8 iTotalMsecs = 0;
1478 size_t uTotalCalls = 0;
1479 ITERATE( TGapSizeMap, gap_size_it, m_GapSizeMap ) {
1480 iTotalMsecs += gap_size_it->first * gap_size_it->second;
1481 uTotalCalls += gap_size_it->second;
1482 }
1483 cerr << "Avg: " << (iTotalMsecs / uTotalCalls) << endl;
1484
1485 // find median
1486 const size_t uIdxWithMedian = (uTotalCalls / 2);
1487 size_t uCurrentIdx = 0;
1488 ITERATE( TGapSizeMap, gap_size_it, m_GapSizeMap ) {
1489 uCurrentIdx += gap_size_it->second;
1490 if( uCurrentIdx >= uIdxWithMedian ) {
1491 cerr << "Median: " << gap_size_it->first << endl;
1492 break;
1493 }
1494 }
1495
1496 // first few
1497 cerr << "Chronologically first few check times: " << endl;
1498 copy( m_FirstFewGaps.begin(),
1499 m_FirstFewGaps.end(),
1500 ostream_iterator<Int8>(cerr, "\n") );
1501
1502 // last few
1503 cerr << "Chronologically last few check times: " << endl;
1504 copy( m_LastFewGaps.begin(),
1505 m_LastFewGaps.end(),
1506 ostream_iterator<Int8>(cerr, "\n") );
1507
1508 // slowest few and fastest few
1509 cerr << "Frequency distribution of slowest and fastest lookup times: " << endl;
1510 cerr << "\t" << "Time" << "\t" << "Count" << endl;
1511 if( m_GapSizeMap.size() <= (2 * x_kGetNumToSaveAtStartAndEnd()) ) {
1512 // if small enough, show the whole table at once
1513 ITERATE( TGapSizeMap, gap_size_it, m_GapSizeMap ) {
1514 cerr << "\t" << gap_size_it ->first << "\t" << gap_size_it->second << endl;
1515 }
1516 } else {
1517 // table is big so only show the first few and the last few
1518 TGapSizeMap::const_iterator gap_size_it = m_GapSizeMap.begin();
1519 ITERATE_SIMPLE(x_kGetNumToSaveAtStartAndEnd()) {
1520 cerr << "\t" << gap_size_it ->first << "\t" << gap_size_it->second << endl;
1521 ++gap_size_it;
1522 }
1523
1524 cout << "\t...\t..." << endl;
1525
1526 TGapSizeMap::reverse_iterator gap_size_rit = m_GapSizeMap.rbegin();
1527 ITERATE_SIMPLE(x_kGetNumToSaveAtStartAndEnd()) {
1528 cerr << "\t" << gap_size_it ->first << "\t" << gap_size_it->second << endl;
1529 ++gap_size_rit;
1530 }
1531 }
1532
1533 // total checks
1534 cerr << "Num checks: " << uTotalCalls << endl;
1535 }
1536
1537
1538 bool IsCanceled() const override
1539 {
1540 // getting the current time should be the first
1541 // command in this function.
1542 CTime timeOfThisCheck(CTime::eCurrent);
1543
1544 const double dDiffInNsecs =
1545 timeOfThisCheck.DiffNanoSecond(m_TimeOfLastCheck);
1546 const Int8 iDiffInMsecs = static_cast<Int8>(dDiffInNsecs / 1000000.0);
1547
1548 ++m_GapSizeMap[iDiffInMsecs];
1549
1550 if( m_FirstFewGaps.size() < x_kGetNumToSaveAtStartAndEnd() ) {
1551 m_FirstFewGaps.push_back(iDiffInMsecs);
1552 }
1553
1554 m_LastFewGaps.push_back(iDiffInMsecs);
1555 if( m_LastFewGaps.size() > x_kGetNumToSaveAtStartAndEnd() ) {
1556 m_LastFewGaps.pop_front();
1557 }
1558
1559 const bool bIsCanceled =
1560 CSignal::IsSignaled(CSignal::eSignal_USR1);
1561 if( bIsCanceled ) {
1562 cerr << "Canceled by SIGUSR1" << endl;
1563 }
1564
1565 // resetting m_TimeOfLastCheck should be the last command
1566 // in this function
1567 m_TimeOfLastCheck.SetCurrent();
1568 return bIsCanceled;
1569 }
1570
1571 private:
1572 // local classes do not allow static fields
1573 size_t x_kGetNumToSaveAtStartAndEnd() const { return 10; }
1574
1575 mutable CTime m_TimeOfLastCheck;
1576 // The key is the gap between checks in milliseconds,
1577 // (which is more than enough resolution for a human-level action)
1578 // and the value is the number of times a gap of that size occurred.
1579 typedef map<Int8, size_t> TGapSizeMap;
1580 mutable TGapSizeMap m_GapSizeMap;
1581
1582 mutable vector<Int8> m_FirstFewGaps;
1583 mutable list<Int8> m_LastFewGaps;
1584 };
1585
1586 m_pCanceledCallback.reset( new CCancelBenchmarking );
1587 }
1588
x_AddSNPAnnots(CBioseq_Handle & bsh)1589 int CAsn2FlatApp::x_AddSNPAnnots(CBioseq_Handle& bsh)
1590 {
1591 int rc = 0;
1592
1593 // SNP annotations can be available only for nucleotide human RefSeq records
1594 if (bsh.GetInst_Mol() == CSeq_inst::eMol_aa ||
1595 sequence::GetTaxId(bsh) != TAX_ID_CONST(9606))
1596 return 0;
1597
1598 // Also skip large scaffolds and chromosomes
1599 CConstRef<CSeq_id> accid =
1600 FindBestChoice(bsh.GetBioseqCore()->GetId(), CSeq_id::Score);
1601
1602 bool skip = (accid->Which() != CSeq_id::e_Other);
1603 if (!skip) {
1604 string acc;
1605 accid->GetLabel(&acc, CSeq_id::eContent);
1606 string acc_prefix = acc.substr(0,2);
1607 if (acc_prefix == "NC" || acc_prefix == "AC" ||
1608 acc_prefix == "NT" || acc_prefix == "NW") {
1609 skip = true;
1610 }
1611 }
1612 if (skip)
1613 return 0;
1614
1615 // If GenBank loader is connecting to PubSeqOS, it's sufficient to add the 'SNP'
1616 // named annot type to the scope.
1617 // Otherwise (in PSG mode), use a separate SNP data loader. For that to work,
1618 // it is necessary to find the actual NA accession corresponding to this record's
1619 // SNP annotation and add it to the SAnnotSelector used by the flatfile generator.
1620 #ifdef USE_SNPLOADER
1621 TGi gi = FindGi(bsh.GetBioseqCore()->GetId());
1622 if (gi > ZERO_GI) {
1623 ncbi::grpcapi::dbsnp::primary_track::SeqIdRequestStringAccverUnion request;
1624 request.set_gi(GI_TO(::google::protobuf::uint64, gi));
1625 ncbi::grpcapi::dbsnp::primary_track::PrimaryTrackReply reply;
1626 CGRPCClientContext context;
1627 auto snp_status = m_SNPTrackStub->ForSeqId(&context, request, &reply);
1628 if (snp_status.ok()) {
1629 string na_acc = reply.na_track_acc_with_filter();
1630 if (!na_acc.empty())
1631 m_FFGenerator->SetAnnotSelector().IncludeNamedAnnotAccession(na_acc);
1632 }
1633 }
1634 #endif
1635
1636 return rc;
1637 }
1638
1639 END_NCBI_SCOPE
1640
1641 USING_NCBI_SCOPE;
1642
1643
1644 /////////////////////////////////////////////////////////////////////////////
1645 //
1646 // Main
1647
main(int argc,const char ** argv)1648 int main(int argc, const char** argv)
1649 {
1650 CScope::SetDefaultKeepExternalAnnotsForEdit(true);
1651 return CAsn2FlatApp().AppMain(argc, argv);
1652 }
1653
1654