1 /*  $Id: asn2flat.cpp 637424 2021-09-13 13:12:39Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Aaron Ucko, Mati Shomrat, Mike DiCuccio, Jonathan Kans, NCBI
27 *
28 * File Description:
29 *   flat-file generator application
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <common/ncbi_source_ver.h>
35 #include <corelib/ncbiapp.hpp>
36 #include <corelib/ncbistd.hpp>
37 #include <corelib/ncbi_signal.hpp>
38 #include <connect/ncbi_core_cxx.hpp>
39 
40 #include <serial/serial.hpp>
41 #include <serial/objistr.hpp>
42 #include <serial/serial.hpp>
43 
44 #include <objects/seqset/Seq_entry.hpp>
45 #include <objects/seqloc/Seq_loc.hpp>
46 #include <objects/seqloc/Seq_interval.hpp>
47 #include <objects/submit/Seq_submit.hpp>
48 #include <objects/seq/seq_macros.hpp>
49 
50 #include <objmgr/object_manager.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/seq_entry_ci.hpp>
53 #include <objmgr/util/sequence.hpp>
54 
55 #include <objtools/cleanup/cleanup.hpp>
56 #include <objtools/format/flat_file_config.hpp>
57 #include <objtools/format/flat_file_generator.hpp>
58 #include <objtools/format/flat_expt.hpp>
59 #include <objects/seqset/gb_release_file.hpp>
60 
61 #include <util/compress/zlib.hpp>
62 #include <util/compress/stream.hpp>
63 
64 #include <objmgr/util/objutil.hpp>
65 
66 #include <misc/data_loaders_util/data_loaders_util.hpp>
67 #include <objtools/data_loaders/genbank/gbloader.hpp>
68 
69 #define USE_CDDLOADER
70 
71 #if defined(HAVE_LIBGRPC) && defined(HAVE_NCBI_VDB)
72 #define USE_SNPLOADER
73 #endif
74 
75 #ifdef USE_SNPLOADER
76 // ID-5865 : For SNP retrieval in PSG mode via SNP data loader
77 #  include <misc/grpc_integration/grpc_integration.hpp>
78 #  include <grpc++/grpc++.h>
79 #  include <objects/dbsnp/primary_track/dbsnp.grpc.pb.h>
80 #  include <objects/dbsnp/primary_track/dbsnp.pb.h>
81 #endif
82 
83 #ifdef USE_CDDLOADER
84     #include <sra/data_loaders/snp/snploader.hpp>
85     #include <objtools/data_loaders/cdd/cdd_loader/cdd_loader.hpp>
86 #endif
87 
88 
89 
90 BEGIN_NCBI_SCOPE
91 USING_SCOPE(objects);
92 
93 class CHTMLFormatterEx : public IHTMLFormatter
94 {
95 public:
96     CHTMLFormatterEx(CRef<CScope> scope);
97     ~CHTMLFormatterEx() override;
98 
99     void FormatProteinId(string&, const CSeq_id& seq_id, const string& prot_id) const override;
100     void FormatTranscriptId(string &, const CSeq_id& seq_id, const string& nuc_id) const override;
101     void FormatNucSearch(CNcbiOstream& os, const string& id) const override;
102     void FormatNucId(string& str, const CSeq_id& seq_id, TIntId gi, const string& acc_id) const override;
103     void FormatTaxid(string& str, const TTaxId taxid, const string& taxname) const override;
104     void FormatLocation(string& str, const CSeq_loc& loc, TIntId gi, const string& visible_text) const override;
105     void FormatModelEvidence(string& str, const SModelEvidance& me) const override;
106     void FormatTranscript(string& str, const string& name) const override;
107     void FormatGeneralId(CNcbiOstream& os, const string& id) const override;
108     void FormatGapLink(CNcbiOstream& os, TSeqPos gap_size, const string& id, bool is_prot) const override;
109     void FormatUniProtId(string& str, const string& prot_id) const override;
110 
111 private:
112     mutable CRef<CScope> m_scope;
113 };
114 
115 
CHTMLFormatterEx(CRef<CScope> scope)116 CHTMLFormatterEx::CHTMLFormatterEx(CRef<CScope> scope) : m_scope(scope)
117 {
118 }
119 
~CHTMLFormatterEx()120 CHTMLFormatterEx::~CHTMLFormatterEx()
121 {
122 }
123 
FormatProteinId(string & str,const CSeq_id & seq_id,const string & prot_id) const124 void CHTMLFormatterEx::FormatProteinId(string& str, const CSeq_id& seq_id, const string& prot_id) const
125 {
126     string index = prot_id;
127     CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id);
128     vector< CSeq_id_Handle > ids = bsh.GetId();
129     ITERATE(vector< CSeq_id_Handle >, it, ids) {
130         CSeq_id_Handle hid = *it;
131         if (hid.IsGi()) {
132             index = NStr::NumericToString(hid.GetGi());
133             break;
134         }
135     }
136     str = "<a href=\"";
137     str += strLinkBaseProt;
138     str += index;
139     str += "\">";
140     str += prot_id;
141     str += "</a>";
142 }
143 
FormatTranscriptId(string & str,const CSeq_id & seq_id,const string & nuc_id) const144 void CHTMLFormatterEx::FormatTranscriptId(string& str, const CSeq_id& seq_id, const string& nuc_id) const
145 {
146     string index = nuc_id;
147     CBioseq_Handle bsh = m_scope->GetBioseqHandle(seq_id);
148     vector< CSeq_id_Handle > ids = bsh.GetId();
149     ITERATE(vector< CSeq_id_Handle >, it, ids) {
150         CSeq_id_Handle hid = *it;
151         if (hid.IsGi()) {
152             index = NStr::NumericToString(hid.GetGi());
153             break;
154         }
155     }
156     str = "<a href=\"";
157     str += strLinkBaseNuc;
158     str += index;
159     str += "\">";
160     str += nuc_id;
161     str += "</a>";
162 }
163 
FormatNucId(string & str,const CSeq_id & seq_id,TIntId gi,const string & acc_id) const164 void CHTMLFormatterEx::FormatNucId(string& str, const CSeq_id& seq_id, TIntId gi, const string& acc_id) const
165 {
166     str = "<a href=\"";
167     str += strLinkBaseNuc + NStr::NumericToString(gi) + "\">" + acc_id + "</a>";
168 }
169 
FormatTaxid(string & str,const TTaxId taxid,const string & taxname) const170 void CHTMLFormatterEx::FormatTaxid(string& str, const TTaxId taxid, const string& taxname) const
171 {
172     if (!NStr::StartsWith(taxname, "Unknown", NStr::eNocase)) {
173         if (taxid > ZERO_TAX_ID) {
174             str += "<a href=\"";
175             str += strLinkBaseTaxonomy;
176             str += "id=";
177             str += NStr::NumericToString(taxid);
178             str += "\">";
179         }
180         else {
181             string t_taxname = taxname;
182             replace(t_taxname.begin(), t_taxname.end(), ' ', '+');
183             str += "<a href=\"";
184             str += strLinkBaseTaxonomy;
185             str += "name=";
186             str += taxname;
187             str += "\">";
188         }
189         str += taxname;
190         str += "</a>";
191     }
192     else {
193         str = taxname;
194     }
195 
196     TryToSanitizeHtml(str);
197 }
198 
FormatNucSearch(CNcbiOstream & os,const string & id) const199 void CHTMLFormatterEx::FormatNucSearch(CNcbiOstream& os, const string& id) const
200 {
201     os << "<a href=\"" << strLinkBaseNucSearch << id << "\">" << id << "</a>";
202 }
203 
FormatLocation(string & strLink,const CSeq_loc & loc,TIntId gi,const string & visible_text) const204 void CHTMLFormatterEx::FormatLocation(string& strLink, const CSeq_loc& loc, TIntId gi, const string& visible_text) const
205 {
206     // check if this is a protein or nucleotide link
207     bool is_prot = false;
208     {{
209         CBioseq_Handle bioseq_h;
210         ITERATE(CSeq_loc, loc_ci, loc) {
211             bioseq_h = m_scope->GetBioseqHandle(loc_ci.GetSeq_id());
212             if (bioseq_h) {
213                 break;
214             }
215         }
216         if (bioseq_h) {
217             is_prot = (bioseq_h.GetBioseqMolType() == CSeq_inst::eMol_aa);
218         }
219     }}
220 
221     // assembly of the actual string:
222     strLink.reserve(100); // euristical URL length
223 
224     strLink = "<a href=\"";
225 
226     // link base
227     if (is_prot) {
228         strLink += strLinkBaseProt;
229     }
230     else {
231         strLink += strLinkBaseNuc;
232     }
233     strLink += NStr::NumericToString(gi);
234 
235     // location
236     if (loc.IsInt() || loc.IsPnt()) {
237         TSeqPos iFrom = loc.GetStart(eExtreme_Positional) + 1;
238         TSeqPos iTo = loc.GetStop(eExtreme_Positional) + 1;
239         strLink += "?from=";
240         strLink += NStr::IntToString(iFrom);
241         strLink += "&amp;to=";
242         strLink += NStr::IntToString(iTo);
243     }
244     else if (visible_text != "Precursor") {
245         // TODO: this fails on URLs that require "?itemID=" (e.g. almost any, such as U54469)
246         strLink += "?itemid=TBD";
247     }
248 
249     strLink += "\">";
250     strLink += visible_text;
251     strLink += "</a>";
252 }
253 
FormatModelEvidence(string & str,const SModelEvidance & me) const254 void CHTMLFormatterEx::FormatModelEvidence(string& str, const SModelEvidance& me) const
255 {
256     str += "<a href=\"";
257     str += strLinkBaseNuc;
258     if (me.gi > ZERO_GI) {
259         str += NStr::NumericToString(me.gi);
260     }
261     else {
262         str += me.name;
263     }
264     str += "?report=graph";
265     if ((me.span.first >= 0) && (me.span.second >= me.span.first)) {
266         const Int8 kPadAmount = 500;
267         // The "+1" is because we display 1-based to user and in URL
268         str += "&v=";
269         str += NStr::NumericToString(max<Int8>(me.span.first + 1 - kPadAmount, 1));
270         str += ":";
271         str += NStr::NumericToString(me.span.second + 1 + kPadAmount); // okay if second number goes over end of sequence
272     }
273     str += "\">";
274     str += me.name;
275     str += "</a>";
276 }
277 
FormatTranscript(string & str,const string & name) const278 void CHTMLFormatterEx::FormatTranscript(string& str, const string& name) const
279 {
280     str += "<a href=\"";
281     str += strLinkBaseNuc;
282     str += name;
283     str += "\">";
284     str += name;
285     str += "</a>";
286 }
287 
FormatGeneralId(CNcbiOstream & os,const string & id) const288 void CHTMLFormatterEx::FormatGeneralId(CNcbiOstream& os, const string& id) const
289 {
290     os << "<a href=\"" << strLinkBaseNuc << id << "\">" << id << "</a>";
291 }
292 
FormatGapLink(CNcbiOstream & os,TSeqPos gap_size,const string & id,bool is_prot) const293 void CHTMLFormatterEx::FormatGapLink(CNcbiOstream& os, TSeqPos gap_size,
294                                      const string& id, bool is_prot) const
295 {
296     const string link_base = (is_prot ? strLinkBaseProt : strLinkBaseNuc);
297     const char *mol_type = (is_prot ? "aa" : "bp" );
298     os << "          [gap " << gap_size << " " << mol_type << "]" <<
299         "    <a href=\"" << link_base << id << "?expand-gaps=on\">Expand Ns</a>";
300 }
301 
FormatUniProtId(string & str,const string & prot_id) const302 void CHTMLFormatterEx::FormatUniProtId(string& str, const string& prot_id) const
303 {
304     str = "<a href=\"";
305     str += strLinkBaseUniProt;
306     str += prot_id;
307     str += "\">";
308     str += prot_id;
309     str += "</a>";
310 }
311 
312 class CAsn2FlatApp : public CNcbiApplication, public CGBReleaseFile::ISeqEntryHandler
313 {
314 public:
315     CAsn2FlatApp();
316     ~CAsn2FlatApp();
317 
318     void Init() override;
319     int Run() override;
320 
321     bool HandleSeqEntry(CRef<CSeq_entry>& se) override;
322     bool HandleSeqEntry(const CSeq_entry_Handle& seh);
323 
324 protected:
325     void HandleSeqSubmit(CObjectIStream& is );
326     void HandleSeqSubmit(CSeq_submit& sub);
327     void HandleSeqId(const string& id);
328 
329     CSeq_entry_Handle ObtainSeqEntryFromSeqEntry(CObjectIStream& is, bool report = false);
330     CSeq_entry_Handle ObtainSeqEntryFromBioseq(CObjectIStream& is, bool report = false);
331     CSeq_entry_Handle ObtainSeqEntryFromBioseqSet(CObjectIStream& is, bool report = false);
332 
333     CNcbiOstream* OpenFlatfileOstream(const string& name);
334 
335 private:
336     // types
337     typedef CFlatFileConfig::CGenbankBlockCallback TGenbankBlockCallback;
338 
339     CObjectIStream* x_OpenIStream(const CArgs& args);
340 
341     CFlatFileGenerator* x_CreateFlatFileGenerator(const CArgs& args);
342     TGenbankBlockCallback* x_GetGenbankCallback(const CArgs& args);
343     TSeqPos x_GetFrom(const CArgs& args);
344     TSeqPos x_GetTo  (const CArgs& args);
345     ENa_strand x_GetStrand(const CArgs& args);
346     void x_GetLocation(const CSeq_entry_Handle& entry,
347         const CArgs& args, CSeq_loc& loc);
348     CBioseq_Handle x_DeduceTarget(const CSeq_entry_Handle& entry);
349     void x_CreateCancelBenchmarkCallback();
350     int x_AddSNPAnnots(CBioseq_Handle& bsh);
351 
352     // data
353     CRef<CObjectManager>        m_Objmgr;       // Object Manager
354     CRef<CScope>                m_Scope;
355 
356     CNcbiOstream*               m_Os;           // all sequence output stream
357     bool                        m_OnlyNucs;
358     bool                        m_OnlyProts;
359 
360     CNcbiOstream*               m_On;           // nucleotide output stream
361     CNcbiOstream*               m_Og;           // genomic output stream
362     CNcbiOstream*               m_Or;           // RNA output stream
363     CNcbiOstream*               m_Op;           // protein output stream
364     CNcbiOstream*               m_Ou;           // unknown output stream
365 
366     CRef<CFlatFileGenerator>    m_FFGenerator;  // Flat-file generator
367     unique_ptr<ICanceled>         m_pCanceledCallback;
368     bool                        m_do_cleanup;
369     bool                        m_Exception;
370     bool                        m_FetchFail;
371     bool                        m_PSGMode;
372 #ifdef USE_SNPLOADER
373     CRef<CSNPDataLoader>        m_SNPDataLoader;
374     unique_ptr<ncbi::grpcapi::dbsnp::primary_track::DbSnpPrimaryTrack::Stub> m_SNPTrackStub;
375 #endif
376 #ifdef USE_CDDLOADER
377     CRef<CCDDDataLoader>        m_CDDDataLoader;
378 #endif
379 };
380 
381 
382 // constructor
CAsn2FlatApp()383 CAsn2FlatApp::CAsn2FlatApp()
384 {
385     const CVersionInfo vers (6, NCBI_SC_VERSION_PROXY, NCBI_TEAMCITY_BUILD_NUMBER_PROXY);
386     SetVersion (vers);
387 }
388 
389 // destructor
~CAsn2FlatApp()390 CAsn2FlatApp::~CAsn2FlatApp()
391 {
392 }
393 
Init()394 void CAsn2FlatApp::Init()
395 {
396     unique_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
397     arg_desc->SetUsageContext("", "Convert an ASN.1 Seq-entry into a flat report");
398 
399     // input
400     {{
401          arg_desc->SetCurrentGroup("Input/Output Options");
402          // name
403          arg_desc->AddOptionalKey("i", "InputFile",
404                                   "Input file name",
405                                   CArgDescriptions::eInputFile);
406 
407          // input file serial format (AsnText\AsnBinary\XML, default: AsnText)
408          arg_desc->AddOptionalKey("serial", "SerialFormat", "Input file format",
409                                   CArgDescriptions::eString);
410          arg_desc->SetConstraint("serial", &(*new CArgAllow_Strings,
411                                              "text", "binary", "XML"));
412          arg_desc->AddFlag("sub", "Submission");
413          // id
414          arg_desc->AddOptionalKey("id", "ID",
415                                   "Specific ID to display",
416                                   CArgDescriptions::eString);
417          arg_desc->AddOptionalKey("ids", "IDFile",
418                                   "FIle of IDs to display, one per line",
419                                   CArgDescriptions::eInputFile);
420 
421          // input type:
422          arg_desc->AddDefaultKey( "type", "AsnType", "ASN.1 object type",
423                                   CArgDescriptions::eString, "any" );
424          arg_desc->SetConstraint( "type",
425                                   &( *new CArgAllow_Strings, "any", "seq-entry", "bioseq", "bioseq-set", "seq-submit" ) );
426 
427         // single output name
428         arg_desc->AddOptionalKey("o", "SingleOutputFile",
429             "Single output file name", CArgDescriptions::eOutputFile);
430 
431         // file names
432         arg_desc->AddOptionalKey("on", "NucleotideOutputFile",
433             "Nucleotide output file name", CArgDescriptions::eOutputFile);
434         arg_desc->SetDependency("on", CArgDescriptions::eExcludes, "o");
435 
436         arg_desc->AddOptionalKey("og", "GenomicOutputFile",
437             "Genomic output file name", CArgDescriptions::eOutputFile);
438         arg_desc->SetDependency("og", CArgDescriptions::eExcludes, "o");
439         arg_desc->SetDependency("og", CArgDescriptions::eExcludes, "on");
440 
441         arg_desc->AddOptionalKey("or", "RNAOutputFile",
442             "RNA output file name", CArgDescriptions::eOutputFile);
443         arg_desc->SetDependency("or", CArgDescriptions::eExcludes, "o");
444         arg_desc->SetDependency("or", CArgDescriptions::eExcludes, "on");
445 
446         arg_desc->AddOptionalKey("op", "ProteinOutputFile",
447             "Protein output file name", CArgDescriptions::eOutputFile);
448         arg_desc->SetDependency("op", CArgDescriptions::eExcludes, "o");
449 
450         arg_desc->AddOptionalKey("ou", "UnknownOutputFile",
451             "Unknown output file name", CArgDescriptions::eOutputFile);
452         arg_desc->SetDependency("ou", CArgDescriptions::eExcludes, "o");
453      }}
454 
455     // batch processing
456     {{
457          arg_desc->SetCurrentGroup("Batch Processing Options");
458          arg_desc->AddFlag("batch", "Process NCBI release file");
459          // compression
460          arg_desc->AddFlag("c", "Compressed file");
461          // propogate top descriptors
462          arg_desc->AddFlag("p", "Propagate top descriptors");
463      }}
464 
465     // in flat_file_config.cpp
466     CFlatFileConfig::AddArgumentDescriptions(*arg_desc);
467 
468      // debugging options
469      {{
470          arg_desc->SetCurrentGroup("Debugging Options - Subject to change or removal without warning");
471 
472          // benchmark cancel-checking
473          arg_desc->AddFlag(
474              "benchmark-cancel-checking",
475              "Check statistics on how often the flatfile generator checks if "
476              "it should be canceled.  This also sets up SIGUSR1 to trigger "
477              "cancellation." );
478      }}
479 
480      CDataLoadersUtil::AddArgumentDescriptions(*arg_desc);
481 
482      arg_desc->SetCurrentGroup(kEmptyStr);
483      SetupArgDescriptions(arg_desc.release());
484 }
485 
486 
OpenFlatfileOstream(const string & name)487 CNcbiOstream* CAsn2FlatApp::OpenFlatfileOstream(const string& name)
488 {
489     const CArgs& args = GetArgs();
490 
491     if (! args[name])
492         return nullptr;
493 
494     CNcbiOstream* flatfile_os = &(args[name].AsOutputFile());
495 
496     // so we don't fail silently if, e.g., the output disk gets full
497     flatfile_os->exceptions( ios::failbit | ios::badbit );
498 
499     return flatfile_os;
500 }
501 
s_INSDSetOpen(bool is_insdseq,CNcbiOstream * os)502 static void s_INSDSetOpen(bool is_insdseq, CNcbiOstream* os) {
503 
504     if (is_insdseq) {
505         *os << "<INSDSet>" << endl;
506     }
507 }
508 
s_INSDSetClose(bool is_insdseq,CNcbiOstream * os)509 static void s_INSDSetClose(bool is_insdseq, CNcbiOstream* os) {
510 
511     if (is_insdseq) {
512         *os << "</INSDSet>" << endl;
513     }
514 }
515 
516 
Run()517 int CAsn2FlatApp::Run()
518 {
519     CTime expires = GetFullVersion().GetBuildInfo().GetBuildTime();
520     if (!expires.IsEmpty())
521     {
522         expires.AddYear();
523         if (CTime(CTime::eCurrent) > expires)
524         {
525             NcbiCerr << "This copy of " << GetProgramDisplayName()
526                      << " is more than 1 year old. Please download the current version if it is newer." << endl;
527         }
528     }
529 
530     m_Exception = false;
531     m_FetchFail = false;
532 
533     // initialize conn library
534     CONNECT_Init(&GetConfig());
535 
536     const CArgs& args = GetArgs();
537 
538     CSignal::TrapSignals(CSignal::eSignal_USR1);
539 
540     // create object manager
541     m_Objmgr = CObjectManager::GetInstance();
542     if ( !m_Objmgr ) {
543         NCBI_THROW(CException, eUnknown, "Could not create object manager");
544     }
545     if (args["gbload"] || args["id"] || args["ids"]) {
546         static const CDataLoadersUtil::TLoaders default_loaders =
547             CDataLoadersUtil::fGenbank | CDataLoadersUtil::fVDB | CDataLoadersUtil::fSRA;
548         CDataLoadersUtil::SetupObjectManager(args, *m_Objmgr, default_loaders);
549 
550         if (( args["enable-external"] && ! args["no-external"] ) || args["policy"].AsString() == "external" ) {
551             CGBDataLoader* gb_loader = dynamic_cast<CGBDataLoader*>(CObjectManager::GetInstance()->FindDataLoader("GBLOADER"));
552             if (gb_loader) {
553                 // needed to find remote features when reading local ASN.1 file
554                 gb_loader->CGBDataLoader::SetAlwaysLoadExternal(true);
555             }
556         }
557     }
558 
559     m_Scope.Reset(new CScope(*m_Objmgr));
560     m_Scope->AddDefaults();
561 
562     const CNcbiRegistry& cfg = CNcbiApplication::Instance()->GetConfig();
563     m_PSGMode = cfg.GetBool("genbank", "loader_psg", false, 0, CNcbiRegistry::eReturn);
564     if (m_PSGMode) {
565 #ifdef USE_SNPLOADER
566         string host = cfg.GetString("SNPAccess", "host", "");
567         string port = cfg.GetString("SNPAccess", "port", "");
568         string hostport = host + ":" + port;
569 
570         auto channel =
571             grpc::CreateChannel(hostport, grpc::InsecureChannelCredentials());
572         m_SNPTrackStub =
573             ncbi::grpcapi::dbsnp::primary_track::DbSnpPrimaryTrack::NewStub(channel);
574         CRef<CSNPDataLoader> snp_data_loader(CSNPDataLoader::RegisterInObjectManager(*m_Objmgr).GetLoader());
575         m_Scope->AddDataLoader(snp_data_loader->GetLoaderNameFromArgs());
576 #endif
577 #ifdef USE_CDDLOADER
578         bool use_mongo_cdd =
579             cfg.GetBool("genbank", "vdb_cdd", false, 0, CNcbiRegistry::eReturn) &&
580             cfg.GetBool("genbank", "always_load_external", false, 0, CNcbiRegistry::eReturn);
581         if (use_mongo_cdd) {
582             CRef<CCDDDataLoader> cdd_data_loader(CCDDDataLoader::RegisterInObjectManager(*m_Objmgr).GetLoader());
583             m_Scope->AddDataLoader(cdd_data_loader->GetLoaderNameFromArgs());
584         }
585 #endif
586     }
587 
588     // open the output streams
589     m_Os = OpenFlatfileOstream ("o");
590     m_On = OpenFlatfileOstream ("on");
591     m_Og = OpenFlatfileOstream ("og");
592     m_Or = OpenFlatfileOstream ("or");
593     m_Op = OpenFlatfileOstream ("op");
594     m_Ou = OpenFlatfileOstream ("ou");
595 
596     m_OnlyNucs = false;
597     m_OnlyProts = false;
598     if (m_Os) {
599         const string& view = args["view"].AsString();
600         m_OnlyNucs = (view == "nuc");
601         m_OnlyProts = (view == "prot");
602     }
603 
604     if (!m_Os && !m_On && !m_Og && !m_Or && !m_Op && !m_Ou) {
605         // No output (-o*) argument given - default to stdout
606         m_Os = &cout;
607     }
608 
609     bool is_insdseq = false;
610     if (args["format"].AsString() == "insdseq") {
611         // only print <INSDSet> ... </INSDSet> wrappers if single output stream
612         if (m_Os) {
613             is_insdseq = true;
614         }
615     }
616 
617     // create the flat-file generator
618     m_FFGenerator.Reset(x_CreateFlatFileGenerator(args));
619     if ( args["no-external"] || args["policy"].AsString() == "internal" ) {
620         m_FFGenerator->SetAnnotSelector().SetExcludeExternal(true);
621     }
622 //    else if (!m_Scope->GetKeepExternalAnnotsForEdit()) {
623 //       m_Scope->SetKeepExternalAnnotsForEdit();
624 //    }
625     if( args["resolve-all"] || args["policy"].AsString() == "external") {
626         m_FFGenerator->SetAnnotSelector().SetResolveAll();
627     }
628     if( args["depth"] ) {
629         m_FFGenerator->SetAnnotSelector().SetResolveDepth(args["depth"].AsInteger());
630     }
631     if( args["max_search_segments"] ) {
632         m_FFGenerator->SetAnnotSelector().SetMaxSearchSegments(args["max_search_segments"].AsInteger());
633     }
634     if( args["max_search_time"] ) {
635         m_FFGenerator->SetAnnotSelector().SetMaxSearchTime(float(args["max_search_time"].AsDouble()));
636     }
637 
638     unique_ptr<CObjectIStream> is;
639     is.reset( x_OpenIStream( args ) );
640     if (!is) {
641         string msg = args["i"]? "Unable to open input file" + args["i"].AsString() :
642             "Unable to read data from stdin";
643         NCBI_THROW(CException, eUnknown, msg);
644     }
645 
646     if ( args[ "sub" ] ) {
647         s_INSDSetOpen ( is_insdseq, m_Os );
648         HandleSeqSubmit( *is );
649         s_INSDSetClose ( is_insdseq, m_Os );
650         if (m_Exception) return -1;
651         return 0;
652     }
653 
654     if ( args[ "batch" ] ) {
655         s_INSDSetOpen ( is_insdseq, m_Os );
656         bool propagate = args[ "p" ];
657         CGBReleaseFile in( *is.release(), propagate );
658         in.RegisterHandler( this );
659         in.Read();  // HandleSeqEntry will be called from this function
660         s_INSDSetClose ( is_insdseq, m_Os );
661         if (m_Exception) return -1;
662         return 0;
663     }
664 
665     if ( args[ "ids" ] ) {
666         s_INSDSetOpen ( is_insdseq, m_Os );
667         CNcbiIstream& istr = args["ids"].AsInputFile();
668         string id_str;
669         while (NcbiGetlineEOL(istr, id_str)) {
670             id_str = NStr::TruncateSpaces(id_str);
671             if (id_str.empty() || id_str[0] == '#') {
672                 continue;
673             }
674 
675             try {
676                 LOG_POST(Error << "id = " << id_str);
677                 HandleSeqId( id_str );
678             }
679             catch (CException& e) {
680                 ERR_POST(Error << e);
681             }
682         }
683         s_INSDSetClose ( is_insdseq, m_Os );
684         if (m_Exception) return -1;
685         return 0;
686     }
687 
688     if ( args[ "id" ] ) {
689         s_INSDSetOpen ( is_insdseq, m_Os );
690         HandleSeqId( args[ "id" ].AsString() );
691         s_INSDSetClose ( is_insdseq, m_Os );
692         if (m_Exception) return -1;
693         return 0;
694     }
695 
696     string asn_type = args["type"].AsString();
697 
698     s_INSDSetOpen ( is_insdseq, m_Os );
699     if ( asn_type == "seq-entry" ) {
700         //
701         //  Straight through processing: Read a seq_entry, then process
702         //  a seq_entry:
703         //
704         while ( !is->EndOfData() ) {
705             CSeq_entry_Handle seh = ObtainSeqEntryFromSeqEntry(*is, true);
706             if ( !seh ) {
707                 NCBI_THROW(CException, eUnknown,
708                            "Unable to construct Seq-entry object" );
709             }
710             HandleSeqEntry(seh);
711             m_Scope->RemoveTopLevelSeqEntry(seh);
712         }
713     }
714     else if ( asn_type == "bioseq" ) {
715         //
716         //  Read object as a bioseq, wrap it into a seq_entry, then process
717         //  the wrapped bioseq as a seq_entry:
718         //
719         while ( !is->EndOfData() ) {
720             CSeq_entry_Handle seh = ObtainSeqEntryFromBioseq(*is, true);
721             if ( !seh ) {
722                 NCBI_THROW(CException, eUnknown,
723                            "Unable to construct Seq-entry object" );
724             }
725             HandleSeqEntry(seh);
726             m_Scope->RemoveTopLevelSeqEntry(seh);
727         }
728     }
729     else if ( asn_type == "bioseq-set" ) {
730         //
731         //  Read object as a bioseq_set, wrap it into a seq_entry, then
732         //  process the wrapped bioseq_set as a seq_entry:
733         //
734         while ( !is->EndOfData() ) {
735             CSeq_entry_Handle seh = ObtainSeqEntryFromBioseqSet(*is, true);
736             if ( !seh ) {
737                 NCBI_THROW(CException, eUnknown,
738                            "Unable to construct Seq-entry object" );
739             }
740             HandleSeqEntry(seh);
741             m_Scope->RemoveTopLevelSeqEntry(seh);
742         }
743     }
744     else if ( asn_type == "seq-submit" ) {
745         while ( !is->EndOfData() ) {
746             HandleSeqSubmit( *is );
747         }
748     }
749     else if ( asn_type == "any" ) {
750         //
751         //  Try the first four in turn:
752         //
753         while ( !is->EndOfData() ) {
754             string strNextTypeName = is->PeekNextTypeName();
755 
756             CSeq_entry_Handle seh = ObtainSeqEntryFromSeqEntry(*is);
757             if ( !seh ) {
758                 is->Close();
759                 is.reset( x_OpenIStream( args ) );
760                 seh = ObtainSeqEntryFromBioseqSet(*is);
761                 if ( !seh ) {
762                     is->Close();
763                     is.reset( x_OpenIStream( args ) );
764                     seh = ObtainSeqEntryFromBioseq(*is);
765                     if ( !seh ) {
766                         is->Close();
767                         is.reset( x_OpenIStream( args ) );
768                         CRef<CSeq_submit> sub(new CSeq_submit);
769                         *is >> *sub;
770                         if (sub->IsSetSub() && sub->IsSetData()) {
771                             HandleSeqSubmit(*sub);
772                             if (m_Exception) return -1;
773                             return 0;
774                         } else {
775                             NCBI_THROW(
776                                        CException, eUnknown,
777                                        "Unable to construct Seq-entry object"
778                                       );
779                         }
780                     }
781                 }
782             }
783             HandleSeqEntry(seh);
784             m_Scope->RemoveTopLevelSeqEntry(seh);
785         }
786     }
787     s_INSDSetClose ( is_insdseq, m_Os );
788 
789     if (m_Exception) return -1;
790     return 0;
791 }
792 
793 
HandleSeqSubmit(CSeq_submit & sub)794 void CAsn2FlatApp::HandleSeqSubmit(CSeq_submit& sub)
795 {
796     if (!sub.IsSetSub() || !sub.IsSetData() || !sub.GetData().IsEntrys()) {
797         return;
798     }
799     CRef<CScope> scope(new CScope(*m_Objmgr));
800     scope->AddDefaults();
801     if (m_do_cleanup) {
802         CCleanup cleanup;
803         cleanup.BasicCleanup(sub);
804     }
805     const CArgs& args = GetArgs();
806     if (args["from"] || args["to"] || args["strand"]) {
807         CRef<CSeq_entry> e(sub.SetData().SetEntrys().front());
808         CSeq_entry_Handle seh;
809         try {
810             seh = scope->GetSeq_entryHandle(*e);
811         } catch (CException&) {}
812 
813         if (!seh) {  // add to scope if not already in it
814             seh = scope->AddTopLevelSeqEntry(*e);
815         }
816         // "remember" the submission block
817         m_FFGenerator->SetSubmit(sub.GetSub());
818         CSeq_loc loc;
819         x_GetLocation(seh, args, loc);
820         try {
821             m_FFGenerator->Generate(loc, seh.GetScope(), *m_Os);
822         }
823         catch (CException& e) {
824             ERR_POST(Error << e);
825             m_Exception = true;
826         }
827     } else {
828         try {
829             m_FFGenerator->Generate(sub, *scope, *m_Os);
830         }
831         catch (CException& e) {
832             ERR_POST(Error << e);
833             m_Exception = true;
834         }
835     }
836 }
837 
838 
839 //  ============================================================================
HandleSeqSubmit(CObjectIStream & is)840 void CAsn2FlatApp::HandleSeqSubmit(CObjectIStream& is )
841 //  ============================================================================
842 {
843     CRef<CSeq_submit> sub(new CSeq_submit);
844     is >> *sub;
845     HandleSeqSubmit(*sub);
846 }
847 
848 //  ============================================================================
HandleSeqId(const string & strId)849 void CAsn2FlatApp::HandleSeqId(
850     const string& strId )
851 //  ============================================================================
852 {
853     CSeq_entry_Handle seh;
854 
855     // This C++-scope gets a raw CSeq_entry that has no attachment
856     // to any CScope and puts it into entry.
857     {
858         CSeq_id id(strId);
859         CBioseq_Handle bsh = m_Scope->GetBioseqHandle( id );
860         if ( ! bsh ) {
861             NCBI_THROW(
862                 CException, eUnknown,
863                 "Unable to retrieve data for the given ID"
864                 );
865         }
866         seh = bsh.GetParentEntry();
867         /*
868         CConstRef<CSeq_entry> ser = seh.GetTopLevelEntry().GetCompleteSeq_entry();
869         if (ser) {
870             cout << MSerial_AsnText << *ser << endl;
871         }
872         */
873     }
874 
875     //
876     //  ... and use that to generate the flat file:
877     //
878     HandleSeqEntry( seh );
879 }
880 
881 //  ============================================================================
HandleSeqEntry(const CSeq_entry_Handle & seh)882 bool CAsn2FlatApp::HandleSeqEntry(const CSeq_entry_Handle& seh )
883 //  ============================================================================
884 {
885     const CArgs& args = GetArgs();
886 
887     if ( m_do_cleanup ) {
888         CSeq_entry_EditHandle tseh = seh.GetTopLevelEntry().GetEditHandle();
889         CBioseq_set_EditHandle bseth;
890         CBioseq_EditHandle bseqh;
891         CRef<CSeq_entry> tmp_se(new CSeq_entry);
892         if ( tseh.IsSet() ) {
893             bseth = tseh.SetSet();
894             CConstRef<CBioseq_set> bset = bseth.GetCompleteObject();
895             bseth.Remove(bseth.eKeepSeq_entry);
896             tmp_se->SetSet(const_cast<CBioseq_set&>(*bset));
897         }
898         else {
899             bseqh = tseh.SetSeq();
900             CConstRef<CBioseq> bseq = bseqh.GetCompleteObject();
901             bseqh.Remove(bseqh.eKeepSeq_entry);
902             tmp_se->SetSeq(const_cast<CBioseq&>(*bseq));
903         }
904 
905         CCleanup cleanup;
906         cleanup.BasicCleanup( *tmp_se );
907 
908         if ( tmp_se->IsSet() ) {
909             tseh.SelectSet(bseth);
910         }
911         else {
912             tseh.SelectSeq(bseqh);
913         }
914     }
915 
916     if ( args["faster"] || args["policy"].AsString() == "ftp" || args["policy"].AsString() == "web" ) {
917 
918             try {
919                 if ( args["from"] || args["to"] || args["strand"] ) {
920                     CSeq_loc loc;
921                     x_GetLocation( seh, args, loc );
922                     CNcbiOstream* flatfile_os = m_Os;
923                     m_FFGenerator->Generate(loc, seh.GetScope(), *flatfile_os, true, m_Os);
924                 } else {
925                     CNcbiOstream* flatfile_os = m_Os;
926                     m_FFGenerator->Generate( seh, *flatfile_os, true, m_Os, m_On, m_Og, m_Or, m_Op, m_Ou);
927                 }
928             }
929             catch (CException& e) {
930                   ERR_POST(Error << e);
931                   m_Exception = true;
932             }
933 
934         return true;
935     }
936 
937     m_FFGenerator->SetFeatTree(new feature::CFeatTree(seh));
938 
939     for (CBioseq_CI bioseq_it(seh);  bioseq_it;  ++bioseq_it) {
940         CBioseq_Handle bsh = *bioseq_it;
941         CConstRef<CBioseq> bsr = bsh.GetCompleteBioseq();
942 
943         CNcbiOstream* flatfile_os = nullptr;
944 
945         bool is_genomic = false;
946         bool is_RNA = false;
947 
948         CConstRef<CSeqdesc> closest_molinfo
949             = bsr->GetClosestDescriptor(CSeqdesc::e_Molinfo);
950         if (closest_molinfo) {
951             const CMolInfo& molinf = closest_molinfo->GetMolinfo();
952             CMolInfo::TBiomol biomol = molinf.GetBiomol();
953             switch (biomol) {
954                 case NCBI_BIOMOL(genomic):
955                 case NCBI_BIOMOL(other_genetic):
956                 case NCBI_BIOMOL(genomic_mRNA):
957                 case NCBI_BIOMOL(cRNA):
958                     is_genomic = true;
959                     break;
960                 case NCBI_BIOMOL(pre_RNA):
961                 case NCBI_BIOMOL(mRNA):
962                 case NCBI_BIOMOL(rRNA):
963                 case NCBI_BIOMOL(tRNA):
964                 case NCBI_BIOMOL(snRNA):
965                 case NCBI_BIOMOL(scRNA):
966                 case NCBI_BIOMOL(snoRNA):
967                 case NCBI_BIOMOL(transcribed_RNA):
968                 case NCBI_BIOMOL(ncRNA):
969                 case NCBI_BIOMOL(tmRNA):
970                     is_RNA = true;
971                     break;
972                 case NCBI_BIOMOL(other):
973                     {
974                         CBioseq_Handle::TMol mol = bsh.GetSequenceType();
975                         switch (mol) {
976                              case CSeq_inst::eMol_dna:
977                                 is_genomic = true;
978                                 break;
979                             case CSeq_inst::eMol_rna:
980                                 is_RNA = true;
981                                 break;
982                             case CSeq_inst::eMol_na:
983                                 is_genomic = true;
984                                 break;
985                             default:
986                                 break;
987                        }
988                    }
989                    break;
990                 default:
991                     break;
992             }
993         }
994 
995         if ( m_Os ) {
996             if ( m_OnlyNucs && ! bsh.IsNa() ) continue;
997             if ( m_OnlyProts && ! bsh.IsAa() ) continue;
998             flatfile_os = m_Os;
999         } else if ( bsh.IsNa() ) {
1000             if ( m_On ) {
1001                 flatfile_os = m_On;
1002             } else if ( (is_genomic || ! closest_molinfo) && m_Og ) {
1003                 flatfile_os = m_Og;
1004             } else if ( is_RNA && m_Or ) {
1005                 flatfile_os = m_Or;
1006             } else {
1007                 continue;
1008             }
1009         } else if ( bsh.IsAa() ) {
1010             if ( m_Op ) {
1011                 flatfile_os = m_Op;
1012             }
1013         } else {
1014             if ( m_Ou ) {
1015                 flatfile_os = m_Ou;
1016             } else if ( m_On ) {
1017                 flatfile_os = m_On;
1018             } else {
1019                 continue;
1020             }
1021         }
1022 
1023         if ( !flatfile_os ) continue;
1024 
1025         if (m_PSGMode && ( args["enable-external"] || args["policy"].AsString() == "external" ))
1026             x_AddSNPAnnots(bsh);
1027 
1028         // generate flat file
1029         if ( args["from"] || args["to"] || args["strand"] ) {
1030             CSeq_loc loc;
1031             x_GetLocation( seh, args, loc );
1032             try {
1033                 m_FFGenerator->Generate(loc, seh.GetScope(), *flatfile_os);
1034             }
1035             catch (CException& e) {
1036                 ERR_POST(Error << e);
1037                 m_Exception = true;
1038             }
1039             // emulate the C Toolkit: only produce flatfile for first sequence
1040             // when range is specified
1041             return true;
1042         }
1043         else {
1044             int count = args["count"].AsInteger();
1045             for ( int i = 0; i < count; ++i ) {
1046                 try {
1047                     m_FFGenerator->Generate( bsh, *flatfile_os);
1048                 }
1049                 catch (CException& e) {
1050                     ERR_POST(Error << e);
1051                     m_Exception = true;
1052                 }
1053             }
1054 
1055         }
1056     }
1057     return true;
1058 }
1059 
ObtainSeqEntryFromSeqEntry(CObjectIStream & is,bool report)1060 CSeq_entry_Handle CAsn2FlatApp::ObtainSeqEntryFromSeqEntry(CObjectIStream& is, bool report)
1061 {
1062     try {
1063         CRef<CSeq_entry> se(new CSeq_entry);
1064         is >> *se;
1065         if (se->Which() == CSeq_entry::e_not_set) {
1066             NCBI_THROW(CException, eUnknown,
1067                        "provided Seq-entry is empty");
1068         }
1069         return m_Scope->AddTopLevelSeqEntry(*se);
1070     }
1071     catch (CException& e) {
1072         if (report) {
1073             ERR_POST(Error << e);
1074         }
1075     }
1076     return CSeq_entry_Handle();
1077 }
1078 
ObtainSeqEntryFromBioseq(CObjectIStream & is,bool report)1079 CSeq_entry_Handle CAsn2FlatApp::ObtainSeqEntryFromBioseq(CObjectIStream& is, bool report)
1080 {
1081     try {
1082         CRef<CBioseq> bs(new CBioseq);
1083         is >> *bs;
1084         CBioseq_Handle bsh = m_Scope->AddBioseq(*bs);
1085         return bsh.GetTopLevelEntry();
1086     }
1087     catch (CException& e) {
1088         if (report) {
1089             ERR_POST(Error << e);
1090         }
1091     }
1092     return CSeq_entry_Handle();
1093 }
1094 
ObtainSeqEntryFromBioseqSet(CObjectIStream & is,bool report)1095 CSeq_entry_Handle CAsn2FlatApp::ObtainSeqEntryFromBioseqSet(CObjectIStream& is, bool report)
1096 {
1097     try {
1098         CRef<CSeq_entry> entry(new CSeq_entry);
1099         is >> entry->SetSet();
1100         return m_Scope->AddTopLevelSeqEntry(*entry);
1101     }
1102     catch (CException& e) {
1103         if (report) {
1104             ERR_POST(Error << e);
1105         }
1106     }
1107     return CSeq_entry_Handle();
1108 }
1109 
1110 
HandleSeqEntry(CRef<CSeq_entry> & se)1111 bool CAsn2FlatApp::HandleSeqEntry(CRef<CSeq_entry>& se)
1112 {
1113     if (!se) {
1114         return false;
1115     }
1116 
1117     // add entry to scope
1118     CSeq_entry_Handle entry = m_Scope->AddTopLevelSeqEntry(*se);
1119     if ( !entry ) {
1120         NCBI_THROW(CException, eUnknown, "Failed to insert entry to scope.");
1121     }
1122 
1123     bool ret = HandleSeqEntry(entry);
1124     // Needed because we can really accumulate a lot of junk otherwise,
1125     // and we end up with significant slowdown due to repeatedly doing
1126     // linear scans on a growing CScope.
1127     m_Scope->ResetDataAndHistory();
1128     return ret;
1129 }
1130 
x_OpenIStream(const CArgs & args)1131 CObjectIStream* CAsn2FlatApp::x_OpenIStream(const CArgs& args)
1132 {
1133     // determine the file serialization format.
1134     // default for batch files is binary, otherwise text.
1135     ESerialDataFormat serial = args["batch"] ? eSerial_AsnBinary : eSerial_AsnText;
1136     if ( args["serial"] ) {
1137         const string& val = args["serial"].AsString();
1138         if ( val == "text" ) {
1139             serial = eSerial_AsnText;
1140         } else if ( val == "binary" ) {
1141             serial = eSerial_AsnBinary;
1142         } else if ( val == "XML" ) {
1143             serial = eSerial_Xml;
1144         }
1145     }
1146 
1147     // make sure of the underlying input stream. If -i was given on the command line
1148     // then the input comes from a file. Otherwise, it comes from stdin:
1149     CNcbiIstream* pInputStream = &NcbiCin;
1150     bool bDeleteOnClose = false;
1151     if ( args["i"] ) {
1152         pInputStream = new CNcbiIfstream( args["i"].AsString(), ios::binary  );
1153         bDeleteOnClose = true;
1154     }
1155 
1156     // if -c was specified then wrap the input stream into a gzip decompressor before
1157     // turning it into an object stream:
1158     CObjectIStream* pI = nullptr;
1159     if ( args["c"] ) {
1160         CZipStreamDecompressor* pDecompressor = new CZipStreamDecompressor(
1161             512, 512, kZlibDefaultWbits, CZipCompression::fCheckFileHeader );
1162         CCompressionIStream* pUnzipStream = new CCompressionIStream(
1163             *pInputStream, pDecompressor, CCompressionIStream::fOwnProcessor );
1164         pI = CObjectIStream::Open( serial, *pUnzipStream, eTakeOwnership );
1165     }
1166     else {
1167         pI = CObjectIStream::Open(
1168             serial, *pInputStream, (bDeleteOnClose ? eTakeOwnership : eNoOwnership));
1169     }
1170 
1171     if ( pI ) {
1172         pI->UseMemoryPool();
1173     }
1174     return pI;
1175 }
1176 
1177 
x_CreateFlatFileGenerator(const CArgs & args)1178 CFlatFileGenerator* CAsn2FlatApp::x_CreateFlatFileGenerator(const CArgs& args)
1179 {
1180     CFlatFileConfig cfg;
1181     cfg.FromArguments(args);
1182     m_do_cleanup = ( ! args["nocleanup"]);
1183     cfg.BasicCleanup(false);
1184 
1185     if (args["html"])
1186     {
1187         CRef<IHTMLFormatter> html_fmt(new CHTMLFormatterEx(m_Scope));
1188         cfg.SetHTMLFormatter(html_fmt);
1189     }
1190 
1191     CRef<TGenbankBlockCallback> genbank_callback( x_GetGenbankCallback(args) );
1192 
1193     if( args["benchmark-cancel-checking"] ) {
1194         x_CreateCancelBenchmarkCallback();
1195     }
1196 
1197     //CFlatFileConfig cfg(
1198     //    format, mode, style, flags, view, gff_options, genbank_blocks,
1199     //    genbank_callback.GetPointerOrNull(), m_pCanceledCallback.get(),
1200     //    args["cleanup"] );
1201     CFlatFileGenerator* generator = new CFlatFileGenerator(cfg);
1202 
1203     // ID-5865 : SNP annotations must be explicitly added to the annot selector
1204     if (!m_PSGMode && cfg.ShowSNPFeatures()) {
1205         cfg.SetHideSNPFeatures(false);
1206         generator->SetAnnotSelector().IncludeNamedAnnotAccession("SNP");
1207     }
1208 
1209     return generator;
1210 }
1211 
1212 CAsn2FlatApp::TGenbankBlockCallback*
x_GetGenbankCallback(const CArgs & args)1213 CAsn2FlatApp::x_GetGenbankCallback(const CArgs& args)
1214 {
1215     class CSimpleCallback : public TGenbankBlockCallback {
1216     public:
1217         CSimpleCallback() { }
1218         ~CSimpleCallback() override
1219         {
1220             cerr << endl;
1221             cerr << "GENBANK CALLBACK DEMO STATISTICS" << endl;
1222             cerr << endl;
1223             cerr << "Appearances of each type: " << endl;
1224             cerr << endl;
1225             x_DumpTypeToCountMap(m_TypeAppearancesMap);
1226             cerr << endl;
1227             cerr << "Total characters output for each type: " << endl;
1228             cerr << endl;
1229             x_DumpTypeToCountMap(m_TypeCharsMap);
1230             cerr << endl;
1231             cerr << "Average characters output for each type: " << endl;
1232             cerr << endl;
1233             x_PrintAverageStats();
1234         }
1235 
1236         EBioseqSkip notify_bioseq( CBioseqContext & ctx ) override
1237         {
1238             cerr << "notify_bioseq called." << endl;
1239             return eBioseqSkip_No;
1240         }
1241 
1242         // this macro is the lesser evil compared to the messiness that
1243         // you would see here otherwise. (plus it's less error-prone)
1244 #define SIMPLE_CALLBACK_NOTIFY(TItemClass) \
1245         EAction notify( string & block_text, \
1246                         const CBioseqContext& ctx, \
1247                         const TItemClass & item ) override { \
1248         NStr::ReplaceInPlace(block_text, "MODIFY TEST", "WAS MODIFIED TEST" ); \
1249         cerr << #TItemClass << " {" << block_text << '}' << endl; \
1250         ++m_TypeAppearancesMap[#TItemClass]; \
1251         ++m_TypeAppearancesMap["TOTAL"]; \
1252         m_TypeCharsMap[#TItemClass] += block_text.length(); \
1253         m_TypeCharsMap["TOTAL"] += block_text.length(); \
1254         EAction eAction = eAction_Default; \
1255         if( block_text.find("SKIP TEST")  != string::npos ) { \
1256             eAction = eAction_Skip; \
1257         } \
1258         if ( block_text.find("HALT TEST") != string::npos ) { \
1259             eAction = eAction_HaltFlatfileGeneration;         \
1260         } \
1261         return eAction; }
1262 
1263         SIMPLE_CALLBACK_NOTIFY(CStartSectionItem);
1264         SIMPLE_CALLBACK_NOTIFY(CHtmlAnchorItem);
1265         SIMPLE_CALLBACK_NOTIFY(CLocusItem);
1266         SIMPLE_CALLBACK_NOTIFY(CDeflineItem);
1267         SIMPLE_CALLBACK_NOTIFY(CAccessionItem);
1268         SIMPLE_CALLBACK_NOTIFY(CVersionItem);
1269         SIMPLE_CALLBACK_NOTIFY(CGenomeProjectItem);
1270         SIMPLE_CALLBACK_NOTIFY(CDBSourceItem);
1271         SIMPLE_CALLBACK_NOTIFY(CKeywordsItem);
1272         SIMPLE_CALLBACK_NOTIFY(CSegmentItem);
1273         SIMPLE_CALLBACK_NOTIFY(CSourceItem);
1274         SIMPLE_CALLBACK_NOTIFY(CReferenceItem);
1275         SIMPLE_CALLBACK_NOTIFY(CCommentItem);
1276         SIMPLE_CALLBACK_NOTIFY(CPrimaryItem);
1277         SIMPLE_CALLBACK_NOTIFY(CFeatHeaderItem);
1278         SIMPLE_CALLBACK_NOTIFY(CSourceFeatureItem);
1279         SIMPLE_CALLBACK_NOTIFY(CFeatureItem);
1280         SIMPLE_CALLBACK_NOTIFY(CGapItem);
1281         SIMPLE_CALLBACK_NOTIFY(CBaseCountItem);
1282         SIMPLE_CALLBACK_NOTIFY(COriginItem);
1283         SIMPLE_CALLBACK_NOTIFY(CSequenceItem);
1284         SIMPLE_CALLBACK_NOTIFY(CContigItem);
1285         SIMPLE_CALLBACK_NOTIFY(CWGSItem);
1286         SIMPLE_CALLBACK_NOTIFY(CTSAItem);
1287         SIMPLE_CALLBACK_NOTIFY(CEndSectionItem);
1288 #undef SIMPLE_CALLBACK_NOTIFY
1289 
1290     private:
1291         typedef map<string, size_t> TTypeToCountMap;
1292         // for each type, how many instances of that type did we see?
1293         // We use the special string "TOTAL" for a total count.
1294         TTypeToCountMap m_TypeAppearancesMap;
1295         // Like m_TypeAppearancesMap but counts total number of *characters*
1296         // instead of number of items.  Again, there is
1297         // the special value "TOTAL"
1298         TTypeToCountMap m_TypeCharsMap;
1299 
1300         void x_DumpTypeToCountMap(const TTypeToCountMap & the_map ) {
1301             ITERATE( TTypeToCountMap, map_iter, the_map ) {
1302                 cerr << setw(25) << left << (map_iter->first + ':')
1303                      << " " << map_iter->second << endl;
1304             }
1305         }
1306 
1307         void x_PrintAverageStats() {
1308             ITERATE( TTypeToCountMap, map_iter, m_TypeAppearancesMap ) {
1309                 const string sType = map_iter->first;
1310                 const size_t iAppearances = map_iter->second;
1311                 const size_t iTotalCharacters = m_TypeCharsMap[sType];
1312                 cerr << setw(25) << left << (sType + ':')
1313                      << " " << (iTotalCharacters / iAppearances) << endl;
1314             }
1315         }
1316     };
1317 
1318     if( args["demo-genbank-callback"] ) {
1319         return new CSimpleCallback;
1320     } else {
1321         return nullptr;
1322     }
1323 }
1324 
x_GetFrom(const CArgs & args)1325 TSeqPos CAsn2FlatApp::x_GetFrom(const CArgs& args)
1326 {
1327     return args["from"] ?
1328         static_cast<TSeqPos>(args["from"].AsInteger() - 1) :
1329         CRange<TSeqPos>::GetWholeFrom();
1330 }
1331 
1332 
x_GetTo(const CArgs & args)1333 TSeqPos CAsn2FlatApp::x_GetTo(const CArgs& args)
1334 {
1335     return args["to"] ?
1336         static_cast<TSeqPos>(args["to"].AsInteger() - 1) :
1337         CRange<TSeqPos>::GetWholeTo();
1338 }
1339 
x_GetStrand(const CArgs & args)1340 ENa_strand CAsn2FlatApp::x_GetStrand(const CArgs& args)
1341 {
1342     return static_cast<ENa_strand>(args["strand"].AsInteger());
1343 }
1344 
1345 
x_GetLocation(const CSeq_entry_Handle & entry,const CArgs & args,CSeq_loc & loc)1346 void CAsn2FlatApp::x_GetLocation
1347 (const CSeq_entry_Handle& entry,
1348  const CArgs& args,
1349  CSeq_loc& loc)
1350 {
1351     _ASSERT(entry);
1352 
1353     CBioseq_Handle h = x_DeduceTarget(entry);
1354     if ( !h ) {
1355         NCBI_THROW(CFlatException, eInvalidParam,
1356             "Cannot deduce target bioseq.");
1357     }
1358     TSeqPos length = h.GetInst_Length();
1359     TSeqPos from   = x_GetFrom(args);
1360     TSeqPos to     = min(x_GetTo(args), length-1);
1361     ENa_strand strand = eNa_strand_unknown;
1362     if ( args["strand"] ) {
1363         strand = x_GetStrand(args);
1364     }
1365 
1366     if ( from == CRange<TSeqPos>::GetWholeFrom() && to == length ) {
1367         // whole
1368         loc.SetWhole().Assign(*h.GetSeqId());
1369     } else {
1370         // interval
1371         loc.SetInt().SetId().Assign(*h.GetSeqId());
1372         loc.SetInt().SetFrom(from);
1373         loc.SetInt().SetTo(to);
1374         if ( strand > 0 ) {
1375             loc.SetInt().SetStrand(strand);
1376         }
1377     }
1378 }
1379 
1380 
1381 // if the 'from' or 'to' flags are specified try to guess the bioseq.
x_DeduceTarget(const CSeq_entry_Handle & entry)1382 CBioseq_Handle CAsn2FlatApp::x_DeduceTarget(const CSeq_entry_Handle& entry)
1383 {
1384     if ( entry.IsSeq() ) {
1385         return entry.GetSeq();
1386     }
1387 
1388     _ASSERT(entry.IsSet());
1389     CBioseq_set_Handle bsst = entry.GetSet();
1390     if ( !bsst.IsSetClass() ) {
1391         NCBI_THROW(CFlatException, eInvalidParam,
1392             "Cannot deduce target bioseq.");
1393     }
1394     _ASSERT(bsst.IsSetClass());
1395     switch ( bsst.GetClass() ) {
1396     case CBioseq_set::eClass_nuc_prot:
1397         // return the nucleotide
1398         for ( CSeq_entry_CI it(entry); it; ++it ) {
1399             if ( it->IsSeq() ) {
1400                 CBioseq_Handle h = it->GetSeq();
1401                 if ( h && CSeq_inst::IsNa(h.GetInst_Mol()) ) {
1402                     return h;
1403                 }
1404             }
1405         }
1406         break;
1407     case CBioseq_set::eClass_gen_prod_set:
1408         // return the genomic
1409         for ( CSeq_entry_CI it(bsst); it; ++it ) {
1410             if ( it->IsSeq() &&
1411                  it->GetSeq().GetInst_Mol() == CSeq_inst::eMol_dna ) {
1412                  return it->GetSeq();
1413             }
1414         }
1415         break;
1416     case CBioseq_set::eClass_segset:
1417         // return the segmented bioseq
1418         for ( CSeq_entry_CI it(bsst); it; ++it ) {
1419             if ( it->IsSeq() ) {
1420                 return it->GetSeq();
1421             }
1422         }
1423         break;
1424     case CBioseq_set::eClass_genbank:
1425         {
1426             CBioseq_CI bi(bsst, CSeq_inst::eMol_na);
1427             if (bi) {
1428                 return *bi;
1429             }
1430         }
1431         break;
1432     default:
1433         break;
1434     }
1435     NCBI_THROW(CFlatException, eInvalidParam,
1436             "Cannot deduce target bioseq.");
1437 }
1438 
1439 void
x_CreateCancelBenchmarkCallback()1440 CAsn2FlatApp::x_CreateCancelBenchmarkCallback()
1441 {
1442     // This ICanceled interface always says "keep going"
1443     // unless SIGUSR1 is received,
1444     // and it keeps statistics on how often it is checked
1445     class CCancelBenchmarking : public ICanceled {
1446     public:
1447         CCancelBenchmarking()
1448             : m_TimeOfLastCheck(CTime::eCurrent)
1449         {
1450         }
1451 
1452 
1453         ~CCancelBenchmarking()
1454         {
1455             // On destruction, we call "IsCanceled" one more time to make
1456             // sure there wasn't a point where we stopped
1457             // checking IsCanceled.
1458             IsCanceled();
1459 
1460             // print statistics
1461             cerr << "##### Cancel-checking benchmarks:" << endl;
1462             cerr << endl;
1463 
1464             // maybe cancelation was never checked:
1465             if( m_GapSizeMap.empty() ) {
1466                 cerr << "NO DATA" << endl;
1467                 return;
1468             }
1469 
1470             cerr << "(all times in milliseconds)" << endl;
1471             cerr << endl;
1472             // easy stats
1473             cerr << "Min: " << m_GapSizeMap.begin()->first << endl;
1474             cerr << "Max: " << m_GapSizeMap.rbegin()->first << endl;
1475 
1476             // find average
1477             Int8   iTotalMsecs = 0;
1478             size_t uTotalCalls = 0;
1479             ITERATE( TGapSizeMap, gap_size_it, m_GapSizeMap ) {
1480                 iTotalMsecs += gap_size_it->first * gap_size_it->second;
1481                 uTotalCalls += gap_size_it->second;
1482             }
1483             cerr << "Avg: " << (iTotalMsecs / uTotalCalls) << endl;
1484 
1485             // find median
1486             const size_t uIdxWithMedian = (uTotalCalls / 2);
1487             size_t uCurrentIdx = 0;
1488             ITERATE( TGapSizeMap, gap_size_it, m_GapSizeMap ) {
1489                 uCurrentIdx += gap_size_it->second;
1490                 if( uCurrentIdx >= uIdxWithMedian ) {
1491                     cerr << "Median: " << gap_size_it->first << endl;
1492                     break;
1493                 }
1494             }
1495 
1496             // first few
1497             cerr << "Chronologically first few check times: " << endl;
1498             copy( m_FirstFewGaps.begin(),
1499                 m_FirstFewGaps.end(),
1500                 ostream_iterator<Int8>(cerr, "\n") );
1501 
1502             // last few
1503             cerr << "Chronologically last few check times: " << endl;
1504             copy( m_LastFewGaps.begin(),
1505                 m_LastFewGaps.end(),
1506                 ostream_iterator<Int8>(cerr, "\n") );
1507 
1508             // slowest few and fastest few
1509             cerr << "Frequency distribution of slowest and fastest lookup times: " << endl;
1510             cerr << "\t" << "Time" << "\t" << "Count" << endl;
1511             if( m_GapSizeMap.size() <= (2 * x_kGetNumToSaveAtStartAndEnd()) ) {
1512                 // if small enough, show the whole table at once
1513                 ITERATE( TGapSizeMap, gap_size_it, m_GapSizeMap ) {
1514                     cerr << "\t" << gap_size_it ->first << "\t" << gap_size_it->second << endl;
1515                 }
1516             } else {
1517                 // table is big so only show the first few and the last few
1518                 TGapSizeMap::const_iterator gap_size_it = m_GapSizeMap.begin();
1519                 ITERATE_SIMPLE(x_kGetNumToSaveAtStartAndEnd()) {
1520                     cerr << "\t" << gap_size_it ->first << "\t" << gap_size_it->second << endl;
1521                     ++gap_size_it;
1522                 }
1523 
1524                 cout << "\t...\t..." << endl;
1525 
1526                 TGapSizeMap::reverse_iterator gap_size_rit = m_GapSizeMap.rbegin();
1527                 ITERATE_SIMPLE(x_kGetNumToSaveAtStartAndEnd()) {
1528                     cerr << "\t" << gap_size_it ->first << "\t" << gap_size_it->second << endl;
1529                     ++gap_size_rit;
1530                 }
1531             }
1532 
1533             // total checks
1534             cerr << "Num checks: " << uTotalCalls << endl;
1535         }
1536 
1537 
1538         bool IsCanceled() const override
1539         {
1540             // getting the current time should be the first
1541             // command in this function.
1542             CTime timeOfThisCheck(CTime::eCurrent);
1543 
1544             const double dDiffInNsecs =
1545                 timeOfThisCheck.DiffNanoSecond(m_TimeOfLastCheck);
1546             const Int8 iDiffInMsecs = static_cast<Int8>(dDiffInNsecs / 1000000.0);
1547 
1548             ++m_GapSizeMap[iDiffInMsecs];
1549 
1550             if( m_FirstFewGaps.size() < x_kGetNumToSaveAtStartAndEnd() ) {
1551                 m_FirstFewGaps.push_back(iDiffInMsecs);
1552             }
1553 
1554             m_LastFewGaps.push_back(iDiffInMsecs);
1555             if( m_LastFewGaps.size() > x_kGetNumToSaveAtStartAndEnd() ) {
1556                 m_LastFewGaps.pop_front();
1557             }
1558 
1559             const bool bIsCanceled =
1560                 CSignal::IsSignaled(CSignal::eSignal_USR1);
1561             if( bIsCanceled ) {
1562                 cerr << "Canceled by SIGUSR1" << endl;
1563             }
1564 
1565             // resetting m_TimeOfLastCheck should be the last command
1566             // in this function
1567             m_TimeOfLastCheck.SetCurrent();
1568             return bIsCanceled;
1569         }
1570 
1571     private:
1572         // local classes do not allow static fields
1573         size_t x_kGetNumToSaveAtStartAndEnd() const { return 10; }
1574 
1575         mutable CTime m_TimeOfLastCheck;
1576         // The key is the gap between checks in milliseconds,
1577         // (which is more than enough resolution for a human-level action)
1578         // and the value is the number of times a gap of that size occurred.
1579         typedef map<Int8, size_t> TGapSizeMap;
1580         mutable TGapSizeMap m_GapSizeMap;
1581 
1582         mutable vector<Int8> m_FirstFewGaps;
1583         mutable list<Int8>   m_LastFewGaps;
1584     };
1585 
1586     m_pCanceledCallback.reset( new CCancelBenchmarking );
1587 }
1588 
x_AddSNPAnnots(CBioseq_Handle & bsh)1589 int CAsn2FlatApp::x_AddSNPAnnots(CBioseq_Handle& bsh)
1590 {
1591     int rc = 0;
1592 
1593     // SNP annotations can be available only for nucleotide human RefSeq records
1594     if (bsh.GetInst_Mol() == CSeq_inst::eMol_aa ||
1595         sequence::GetTaxId(bsh) != TAX_ID_CONST(9606))
1596         return 0;
1597 
1598     // Also skip large scaffolds and chromosomes
1599     CConstRef<CSeq_id> accid =
1600         FindBestChoice(bsh.GetBioseqCore()->GetId(), CSeq_id::Score);
1601 
1602     bool skip = (accid->Which() != CSeq_id::e_Other);
1603     if (!skip) {
1604         string acc;
1605         accid->GetLabel(&acc, CSeq_id::eContent);
1606         string acc_prefix = acc.substr(0,2);
1607         if (acc_prefix == "NC" || acc_prefix == "AC" ||
1608             acc_prefix == "NT" || acc_prefix == "NW") {
1609             skip = true;
1610         }
1611     }
1612     if (skip)
1613         return 0;
1614 
1615     // If GenBank loader is connecting to PubSeqOS, it's sufficient to add the 'SNP'
1616     // named annot type to the scope.
1617     // Otherwise (in PSG mode), use a separate SNP data loader. For that to work,
1618     // it is necessary to find the actual NA accession corresponding to this record's
1619     // SNP annotation and add it to the SAnnotSelector used by the flatfile generator.
1620 #ifdef USE_SNPLOADER
1621     TGi gi = FindGi(bsh.GetBioseqCore()->GetId());
1622     if (gi > ZERO_GI) {
1623         ncbi::grpcapi::dbsnp::primary_track::SeqIdRequestStringAccverUnion request;
1624         request.set_gi(GI_TO(::google::protobuf::uint64, gi));
1625         ncbi::grpcapi::dbsnp::primary_track::PrimaryTrackReply reply;
1626         CGRPCClientContext context;
1627         auto snp_status = m_SNPTrackStub->ForSeqId(&context, request, &reply);
1628         if (snp_status.ok()) {
1629             string na_acc = reply.na_track_acc_with_filter();
1630             if (!na_acc.empty())
1631                 m_FFGenerator->SetAnnotSelector().IncludeNamedAnnotAccession(na_acc);
1632         }
1633     }
1634 #endif
1635 
1636     return rc;
1637 }
1638 
1639 END_NCBI_SCOPE
1640 
1641 USING_NCBI_SCOPE;
1642 
1643 
1644 /////////////////////////////////////////////////////////////////////////////
1645 //
1646 // Main
1647 
main(int argc,const char ** argv)1648 int main(int argc, const char** argv)
1649 {
1650     CScope::SetDefaultKeepExternalAnnotsForEdit(true);
1651     return CAsn2FlatApp().AppMain(argc, argv);
1652 }
1653 
1654