1 /*  $Id: biosample_chk.cpp 580815 2019-02-21 12:30:55Z choi $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   check biosource and structured comment descriptors against biosample database
30  *
31  */
32 
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistre.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/ncbiargs.hpp>
39 #include <corelib/ncbiutil.hpp>
40 
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/objectio.hpp>
44 
45 #include <connect/ncbi_core_cxx.hpp>
46 #include <connect/ncbi_util.h>
47 #include <connect/ncbi_http_session.hpp>
48 
49 // Objects includes
50 #include <objects/general/Object_id.hpp>
51 #include <objects/seq/Bioseq.hpp>
52 #include <objects/seqloc/Seq_id.hpp>
53 #include <objects/seqloc/Seq_loc.hpp>
54 #include <objects/seqloc/Seq_interval.hpp>
55 #include <objects/seq/Seq_inst.hpp>
56 #include <objects/seqfeat/BioSource.hpp>
57 #include <objects/seqfeat/SubSource.hpp>
58 #include <objects/seqfeat/Org_ref.hpp>
59 #include <objects/seqfeat/OrgName.hpp>
60 #include <objects/seqfeat/OrgMod.hpp>
61 #include <objects/seqfeat/PCRReactionSet.hpp>
62 #include <objects/seqfeat/PCRReaction.hpp>
63 #include <objects/seqfeat/PCRPrimer.hpp>
64 #include <objects/seqfeat/PCRPrimerSet.hpp>
65 #include <objects/seqfeat/PCRPrimerName.hpp>
66 #include <objects/seqfeat/PCRPrimerSeq.hpp>
67 #include <objects/seq/Pubdesc.hpp>
68 #include <objects/pub/Pub.hpp>
69 #include <objects/pub/Pub_equiv.hpp>
70 #include <objects/biblio/Cit_sub.hpp>
71 #include <objects/biblio/Cit_gen.hpp>
72 #include <objects/biblio/Auth_list.hpp>
73 #include <objects/biblio/Author.hpp>
74 #include <objects/biblio/Affil.hpp>
75 #include <objects/general/Person_id.hpp>
76 #include <objects/general/Name_std.hpp>
77 #include <objects/submit/Seq_submit.hpp>
78 #include <objects/submit/Submit_block.hpp>
79 #include <objects/submit/Contact_info.hpp>
80 #include <objects/seqset/Seq_entry.hpp>
81 #include <objtools/cleanup/cleanup.hpp>
82 #include <objects/seqtable/SeqTable_multi_data.hpp>
83 #include <objects/seqtable/SeqTable_column_info.hpp>
84 #include <util/line_reader.hpp>
85 #include <util/compress/stream_util.hpp>
86 #include <util/format_guess.hpp>
87 
88 #include <objects/seqset/Bioseq_set.hpp>
89 
90 // Object Manager includes
91 #include <objmgr/object_manager.hpp>
92 #include <objmgr/scope.hpp>
93 #include <objmgr/seq_descr_ci.hpp>
94 #include <objmgr/bioseq_handle.hpp>
95 #include <objmgr/bioseq_ci.hpp>
96 #include <objmgr/seqdesc_ci.hpp>
97 
98 #include <objtools/data_loaders/genbank/gbloader.hpp>
99 #ifdef HAVE_NCBI_VDB
100 #  include <sra/data_loaders/wgs/wgsloader.hpp>
101 #endif
102 #include <misc/jsonwrapp/jsonwrapp.hpp>
103 #include <misc/xmlwrapp/xmlwrapp.hpp>
104 
105 
106 #include <misc/biosample_util/biosample_util.hpp>
107 #include <misc/biosample_util/struc_table_column.hpp>
108 
109 #include <common/test_assert.h>  /* This header must go last */
110 
111 
112 using namespace ncbi;
113 using namespace objects;
114 using namespace xml;
115 
// Version string of the biosample_chk application.
const char* BIOSAMPLE_CHK_APP_VER = "1.0";
117 
118 /////////////////////////////////////////////////////////////////////////////
119 //
//  BioSample checker application
121 //
122 
123 
124 class CBiosampleHandler
125 {
126 public:
CBiosampleHandler()127     CBiosampleHandler() :
128         m_ReportStream(0),
129         m_UseDevServer(false),
130         m_Username(""),
131         m_Password("")
132         {}
133 
~CBiosampleHandler()134     virtual ~CBiosampleHandler() {}
135 
ProcessBioseq(CBioseq_Handle bh)136     virtual void ProcessBioseq(CBioseq_Handle bh) {}
NeedsReportStream()137     virtual bool NeedsReportStream() { return false; }
AddSummary()138     virtual void AddSummary() {}
139 
SetReportStream(CNcbiOstream * stream)140     void SetReportStream(CNcbiOstream* stream) { m_ReportStream = stream; }
141 
142 protected:
143     CNcbiOstream* m_ReportStream;
144     bool m_UseDevServer;
145     string m_Username;
146     string m_Password;
147 };
148 
149 
150 class CBiosampleStatusReport : public CBiosampleHandler
151 {
152 public:
CBiosampleStatusReport()153     CBiosampleStatusReport() : CBiosampleHandler() {}
~CBiosampleStatusReport()154     virtual ~CBiosampleStatusReport() {}
155     virtual void ProcessBioseq(CBioseq_Handle bh);
NeedsReportStream()156     virtual bool NeedsReportStream() { return true; }
157     virtual void AddSummary();
158 
159 protected:
160     biosample_util::TStatuses m_Status;
161 };
162 
163 
ProcessBioseq(CBioseq_Handle bsh)164 void CBiosampleStatusReport::ProcessBioseq(CBioseq_Handle bsh)
165 {
166     vector<string> ids = biosample_util::GetBiosampleIDs(bsh);
167     if (ids.empty()) {
168         return;
169     }
170 
171     for (const auto &it : ids) {
172         if (m_Status.find(it) == m_Status.end()) {
173             biosample_util::TStatus new_pair(it, biosample_util::eStatus_Unknown);
174             m_Status.insert(new_pair);
175         }
176     }
177 }
178 
AddSummary()179 void CBiosampleStatusReport::AddSummary()
180 {
181     if (m_Status.empty()) {
182         *m_ReportStream << "No BioSample IDs found" << endl;
183     } else {
184         biosample_util::GetBiosampleStatus(m_Status, m_UseDevServer);
185         biosample_util::TStatuses::iterator it = m_Status.begin();
186         while (it != m_Status.end()) {
187             *m_ReportStream << it->first << "\t" << biosample_util::GetBiosampleStatusName(it->second) << endl;
188             ++it;
189         }
190     }
191     m_Status.clear();
192 }
193 
194 
195 class CBiosampleChkApp : public CNcbiApplication, CReadClassMemberHook
196 {
197 public:
198     CBiosampleChkApp(void);
199 
200     virtual void Init(void);
201     virtual int  Run (void);
202 
203     void ReadClassMember(CObjectIStream& in,
204         const CObjectInfo::CMemberIterator& member);
205 
206 private:
207 
208     void Setup(const CArgs& args);
209 
210     auto_ptr<CObjectIStream> OpenFile(const CArgs& args);
211     auto_ptr<CObjectIStream> OpenFile(const string &fname);
212     void SaveFile(const string &fname, bool useBinaryOutputFormat);
213 
214     void GetBioseqDiffs(CBioseq_Handle bh);
215     void PushToRecord(CBioseq_Handle bh);
216 
217     void ProcessBioseqForUpdate(CBioseq_Handle bh);
218     void ProcessBioseqHandle(CBioseq_Handle bh);
219     void ProcessSeqEntry(CRef<CSeq_entry> se);
220     void ProcessSeqEntry(void);
221     void ProcessSet(void);
222     void ProcessSeqSubmit(void);
223     void ProcessAsnInput (void);
224     void ProcessList (const string& fname);
225     void ProcessFileList (const string& fname);
226     int ProcessOneDirectory(const string& dir_name, const string& file_suffix, const string& file_mask, bool recurse);
227     void ProcessOneFile(string fname);
228     void ProcessReleaseFile(const CArgs& args);
229     CRef<CSeq_entry> ReadSeqEntry(void);
230     CRef<CBioseq_set> ReadBioseqSet(void);
231 
232     void CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList& diffs, bool del_okay);
233     void PrintResults(biosample_util::TBiosampleFieldDiffList& diffs);
234     void PrintDiffs(biosample_util::TBiosampleFieldDiffList& diffs);
235     void PrintTable(CRef<CSeq_table> table);
236 
237     CRef<CScope> BuildScope(void);
238 
239     // for mode 3, biosample_push
240     void UpdateBioSource (CBioseq_Handle bh, const CBioSource& src);
241     vector<CRef<CSeqdesc> > GetBiosampleDescriptors(string fname);
242     vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqSubmit();
243     vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqEntry();
244     vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry& se);
245 
246     CRef<CObjectManager> m_ObjMgr;
247     auto_ptr<CObjectIStream> m_In;
248     bool m_Continue;
249 
250     size_t m_Level;
251 
252     CNcbiOstream* m_ReportStream;
253     bool m_NeedReportHeader;
254     CNcbiOfstream* m_AsnOut;
255     CNcbiOstream* m_LogStream;
256 
257     enum E_Mode {
258         e_report_diffs = 1,     // Default - report diffs between biosources on records with biosample accessions
259                                 // and biosample data
260         e_generate_biosample,
261         e_push,
262         e_take_from_biosample,         // update with qualifiers from BioSample, stop if conflict
263         e_take_from_biosample_force,   // update with qualifiers from BioSample, no stop on conflict
264         e_report_status,               // make table with list of BioSample IDs and statuses
265         e_update_with,                 // use web API for update (with delete)
266         e_update_no                    // use web API for update (no delete)
267     };
268 
269     enum E_ListType {
270         e_none = 0,
271         e_accessions,
272         e_files
273     };
274 
275     int m_Mode;
276     int m_ReturnCode;
277     int m_ListType;
278     string m_StructuredCommentPrefix;
279     bool m_CompareStructuredComments;
280     bool m_UseDevServer;
281     bool m_FirstSeqOnly;
282     string m_Username;
283     string m_Password;
284     string m_IDPrefix;
285     string m_HUPDate;
286     string m_BioSampleAccession;
287     string m_BioProjectAccession;
288     string m_Owner;
289     string m_Comment;
290 
291     size_t m_Processed;
292     size_t m_Unprocessed;
293 
294     biosample_util::TBiosampleFieldDiffList m_Diffs;
295     CRef<CSeq_table> m_Table;
296     vector<CRef<CSeqdesc> > m_Descriptors;
297 
298     CBiosampleHandler * m_Handler;
299 
300     biosample_util::TBioSamples m_cache;
301 };
302 
303 
CBiosampleChkApp(void)304 CBiosampleChkApp::CBiosampleChkApp(void) :
305     m_ObjMgr(0), m_In(0), m_Continue(false),
306     m_Level(0), m_ReportStream(0), m_NeedReportHeader(true), m_AsnOut(0),
307     m_LogStream(0), m_Mode(e_report_diffs), m_ReturnCode(0),
308     m_StructuredCommentPrefix(""), m_CompareStructuredComments(true),
309     m_FirstSeqOnly(false), m_IDPrefix(""), m_HUPDate(""),
310     m_BioSampleAccession(""), m_BioProjectAccession(""),
311     m_Owner(""), m_Comment(""),
312     m_Processed(0), m_Unprocessed(0), m_Handler(NULL)
313 {
314 }
315 
316 
Init(void)317 void CBiosampleChkApp::Init(void)
318 {
319     // Prepare command line descriptions
320 
321     // Create
322     auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
323 
324     arg_desc->AddOptionalKey
325         ("p", "Directory", "Path to ASN.1 Files",
326         CArgDescriptions::eInputFile);
327     arg_desc->AddOptionalKey
328         ("i", "InFile", "Single Input File",
329         CArgDescriptions::eInputFile);
330     arg_desc->AddOptionalKey(
331         "o", "OutFile", "Single Output File",
332         CArgDescriptions::eOutputFile);
333     arg_desc->AddOptionalKey(
334         "f", "Filter", "Substring Filter",
335         CArgDescriptions::eOutputFile);
336     arg_desc->AddDefaultKey
337         ("x", "String", "File Selection Substring", CArgDescriptions::eString, ".sqn");
338     arg_desc->AddFlag("u", "Recurse");
339     arg_desc->AddFlag("d", "Use development Biosample server");
340 
341     arg_desc->AddDefaultKey("a", "a",
342                             "ASN.1 Type (a Automatic, z Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit) or accession list (l)",
343                             CArgDescriptions::eString,
344                             "a");
345 
346     arg_desc->AddFlag("b", "Output binary ASN.1");
347     //arg_desc->AddFlag("c", "Batch File is Compressed");
348     arg_desc->AddFlag("M", "Process only first sequence in file (master)");
349     arg_desc->AddOptionalKey("R", "BioSampleIDPrefix", "BioSample ID Prefix", CArgDescriptions::eString);
350     arg_desc->AddOptionalKey("HUP", "HUPDate", "Hold Until Publish Date", CArgDescriptions::eString);
351 
352     arg_desc->AddOptionalKey(
353         "L", "OutFile", "Log File",
354         CArgDescriptions::eOutputFile);
355 
356     arg_desc->AddDefaultKey(
357         "m", "mode", "Mode:\n"
358         "\t1 create update file\n"
359         "\t2 generate file for creating new biosample entries\n"
360         "\t3 push source info from one file (-i) to others (-p)\n"
361         "\t4 update with source qualifiers from BioSample unless conflict\n"
362         "\t5 update with source qualifiers from BioSample (continue with conflict))\n"
363         "\t6 report transaction status\n"
364         "\t7 use web API for update (with delete)\n"
365         "\t8 use web API for update (no delete)\n",
366         CArgDescriptions::eInteger, "1");
367     CArgAllow* constraint = new CArgAllow_Integers(e_report_diffs, e_update_no);
368     arg_desc->SetConstraint("m", constraint);
369 
370     arg_desc->AddOptionalKey(
371         "P", "Prefix", "StructuredCommentPrefix", CArgDescriptions::eString);
372 
373     arg_desc->AddOptionalKey(
374         "biosample", "BioSampleAccession", "BioSample Accession to use for sequences in record. Report error if sequences contain a reference to a different BioSample accession.", CArgDescriptions::eString);
375     arg_desc->AddOptionalKey(
376         "bioproject", "BioProjectAccession", "BioProject Accession to use for sequences in record. Report error if sequences contain a reference to a different BioProject accession.", CArgDescriptions::eString);
377     arg_desc->AddOptionalKey("comment", "BioSampleComment", "Comment to use for creating new BioSample xml", CArgDescriptions::eString);
378 
379     arg_desc->AddOptionalKey
380         ("authorize", "AuthorizeFile", "Username and Password File",
381         CArgDescriptions::eInputFile);
382 
383     arg_desc->AddOptionalKey("username", "ApiUsername", "Username", CArgDescriptions::eString);
384     arg_desc->AddOptionalKey("password", "ApiPassword", "Password", CArgDescriptions::eString);
385 
386     // Program description
387     string prog_description = "BioSample Checker\n";
388     arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
389         prog_description, false);
390 
391     // Pass argument descriptions to the application
392     SetupArgDescriptions(arg_desc.release());
393 
394 }
395 
396 
ProcessAsnInput(void)397 void CBiosampleChkApp::ProcessAsnInput (void)
398 {
399     // Process file based on its content
400     // Unless otherwise specifien we assume the file in hand is
401     // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
402     // Release file (batch processing) where we process each Seq-entry
403     // at a time.
404     string header = m_In->ReadFileHeader();
405 
406     bool unhandled = false;
407     try {
408         if (header == "Seq-submit" ) {  // Seq-submit
409             ProcessSeqSubmit();
410         } else if ( header == "Seq-entry" ) {           // Seq-entry
411             ProcessSeqEntry();
412         } else if (header == "Bioseq-set" ) {  // Bioseq-set
413             ProcessSet();
414         } else {
415             unhandled = true;
416         }
417     } catch (CException& e) {
418         if (NStr::StartsWith(e.GetMsg(), "duplicate Bioseq id")) {
419             *m_LogStream << e.GetMsg();
420             exit(4);
421         } else {
422             throw e;
423         }
424     }
425     if (unhandled) {
426         NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
427     }
428 
429 }
430 
431 
ProcessList(const string & fname)432 void CBiosampleChkApp::ProcessList (const string& fname)
433 {
434     // Process file with list of accessions
435 
436     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
437     CGBDataLoader::RegisterInObjectManager(*objmgr);
438 #ifdef HAVE_NCBI_VDB
439     CWGSDataLoader::RegisterInObjectManager(*objmgr);
440 #endif
441     CScope scope(*objmgr);
442     scope.AddDefaults();
443 
444     CRef<ILineReader> lr = ILineReader::New (fname);
445     while ( !lr->AtEOF() ) {
446         CTempString line = *++*lr;
447         if (!NStr::IsBlank(line)) {
448             try {
449                 CRef<CSeq_id> id(new CSeq_id(line));
450                 if (id) {
451                     CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
452                     if (bsh) {
453                         ProcessBioseqHandle(bsh);
454                     } else {
455                         *m_LogStream << "Unable to fetch Bioseq for " << line << endl;
456                         string label = "";
457                         id->GetLabel(&label);
458                         *m_LogStream << "  (interpreted as " << label << ")" << endl;
459                         m_Unprocessed++;
460                     }
461                 }
462             } catch (CException& e) {
463                 *m_LogStream << e.GetMsg() << endl;
464                 m_Unprocessed++;
465             }
466         }
467     }
468 
469 }
470 
471 
ProcessFileList(const string & fname)472 void CBiosampleChkApp::ProcessFileList (const string& fname)
473 {
474     // Process file with list of files
475 
476     CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
477     CGBDataLoader::RegisterInObjectManager(*objmgr);
478     CScope scope(*objmgr);
479     scope.AddDefaults();
480 
481     m_ListType = e_none;
482     CRef<ILineReader> lr = ILineReader::New (fname);
483     while ( !lr->AtEOF() ) {
484         CTempString line = *++*lr;
485         if (!NStr::IsBlank(line)) {
486             ProcessOneFile(line);
487         }
488     }
489     m_ListType = e_files;
490 }
491 
492 
ProcessOneFile(string fname)493 void CBiosampleChkApp::ProcessOneFile(string fname)
494 {
495     const CArgs& args = GetArgs();
496 
497     bool need_to_close_report = false;
498     bool need_to_close_asn = false;
499 
500     if (!m_ReportStream &&
501         (m_Mode == e_report_diffs || m_Mode == e_update_with || m_Mode == e_update_no || m_Mode == e_take_from_biosample || m_Mode == e_report_status ||
502          (m_Handler != NULL && m_Handler->NeedsReportStream()))) {
503         string path = fname;
504         size_t pos = NStr::Find(path, ".", NStr::eCase, NStr::eReverseSearch);
505         if (pos != string::npos) {
506             path = path.substr(0, pos);
507         }
508         path = path + ".val";
509         m_Table.Reset(new CSeq_table());
510         m_Table->SetNum_rows(0);
511         m_ReportStream = new CNcbiOfstream(path.c_str());
512         if (!m_ReportStream)
513         {
514             NCBI_THROW(CException, eUnknown, "Unable to open " + path);
515         }
516         need_to_close_report = true;
517         m_NeedReportHeader = true;
518         if (m_Handler && m_Handler->NeedsReportStream()) {
519             m_Handler->SetReportStream(m_ReportStream);
520         }
521     }
522     if (!m_AsnOut && (m_Mode == e_push || m_Mode == e_take_from_biosample || m_Mode == e_take_from_biosample_force)) {
523         string path = fname;
524         size_t pos = NStr::Find(path, ".", NStr::eCase, NStr::eReverseSearch);
525         if (pos != string::npos) {
526             path = path.substr(0, pos);
527         }
528         path = path + ".out";
529         SaveFile(path, args["b"]);
530         need_to_close_asn = true;
531     }
532 
533     m_Diffs.clear();
534     switch (m_ListType) {
535         case e_accessions:
536             ProcessList (fname);
537             break;
538         case e_files:
539             ProcessFileList (fname);
540             break;
541         case e_none:
542             m_In = OpenFile(fname);
543             if (m_In.get() == nullptr) {
544                 NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
545             }
546             if (!m_In->InGoodState()) {
547                 NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
548             }
549             ProcessAsnInput();
550             break;
551     }
552 
553     if (m_Mode == e_report_diffs) {
554         PrintResults(m_Diffs);
555     }
556     if (m_Mode == e_update_with) {
557         CreateBiosampleUpdateWebService(m_Diffs, true);
558     } else if (m_Mode == e_update_no) {
559         CreateBiosampleUpdateWebService(m_Diffs, false);
560     }
561     if (m_Handler != NULL) {
562         m_Handler->AddSummary();
563     }
564 
565     // TODO! Must free diffs
566     m_Diffs.clear();
567 
568     if (need_to_close_report) {
569         if (m_Mode == e_take_from_biosample) {
570             PrintTable(m_Table);
571             m_Table->Reset();
572             m_Table = new CSeq_table();
573             m_Table->SetNum_rows(0);
574         }
575         m_ReportStream->flush();
576         m_ReportStream = 0;
577     }
578     if (need_to_close_asn) {
579         m_AsnOut->flush();
580         m_AsnOut->close();
581         m_AsnOut = 0;
582     }
583 }
584 
585 
GetBiosampleDescriptorsFromSeqEntry(void)586 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptorsFromSeqEntry(void)
587 {
588     // Get seq-entry to process
589     CRef<CSeq_entry> se(ReadSeqEntry());
590 
591     return GetBiosampleDescriptorsFromSeqEntry(*se);
592 }
593 
594 
GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry & se)595 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry& se)
596 {
597     vector<CRef<CSeqdesc> > descriptors;
598 
599     CRef<CScope> scope = BuildScope();
600     CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(se);
601     CBioseq_CI bi(seh, CSeq_inst::eMol_na);
602     if (bi) {
603         CSeqdesc_CI src_desc_ci(*bi, CSeqdesc::e_Source);
604         if (src_desc_ci) {
605             CRef<CSeqdesc> src_desc(new CSeqdesc());
606             src_desc->Assign(*src_desc_ci);
607             descriptors.push_back(src_desc);
608         }
609     }
610 
611     return descriptors;
612 }
613 
614 
GetBiosampleDescriptorsFromSeqSubmit()615 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptorsFromSeqSubmit()
616 {
617     vector<CRef<CSeqdesc> > descriptors;
618     CRef<CSeq_submit> ss(new CSeq_submit);
619 
620     // Get seq-submit to process
621     m_In->Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
622 
623     // Validae Seq-submit
624     CRef<CScope> scope = BuildScope();
625     if (ss->GetData().IsEntrys() && ! ss->GetData().GetEntrys().empty()) {
626         descriptors = GetBiosampleDescriptorsFromSeqEntry(**(ss->GetData().GetEntrys().begin()));
627     }
628     return descriptors;
629 }
630 
631 
GetBiosampleDescriptors(string fname)632 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptors(string fname)
633 {
634     m_In = OpenFile(fname);
635 
636     // Process file based on its content
637     // Unless otherwise specifien we assume the file in hand is
638     // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
639     // Release file (batch processing) where we process each Seq-entry
640     // at a time.
641 
642     string header = m_In->ReadFileHeader();
643 
644     vector<CRef<CSeqdesc> > descriptors;
645     if (header == "Seq-submit" ) {  // Seq-submit
646         descriptors = GetBiosampleDescriptorsFromSeqSubmit();
647     } else if ( header == "Seq-entry" ) {           // Seq-entry
648         descriptors = GetBiosampleDescriptorsFromSeqEntry();
649 
650     } else {
651         NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
652     }
653     return descriptors;
654 }
655 
656 
ProcessOneDirectory(const string & dir_name,const string & file_suffix,const string & file_mask,bool recurse)657 int CBiosampleChkApp::ProcessOneDirectory(const string& dir_name, const string& file_suffix, const string& file_mask, bool recurse)
658 {
659     int num_of_files = 0;
660 
661     CDir dir(dir_name);
662     CDir::TEntries files (dir.GetEntries(file_mask, CDir::eFile));
663     for (const auto &ii : files) {
664         string fname = ii->GetName();
665         if (ii->IsFile() &&
666             (!file_suffix.empty() || NStr::Find (fname, file_suffix) != string::npos)) {
667             ++num_of_files;
668             string fname = CDirEntry::MakePath(dir_name, ii->GetName());
669             ProcessOneFile (fname);
670         }
671     }
672     if (recurse) {
673         CDir::TEntries subdirs (dir.GetEntries("", CDir::eDir));
674         for (const auto &ii : subdirs) {
675             string subdir = ii->GetName();
676             if (ii->IsDir() && !NStr::Equal(subdir, ".") && !NStr::Equal(subdir, "..")) {
677                 string subname = CDirEntry::MakePath(dir_name, ii->GetName());
678                 num_of_files += ProcessOneDirectory (subname, file_suffix, file_mask, recurse);
679             }
680         }
681     }
682     if (!num_of_files)
683     {
684         NCBI_THROW(CException, eUnknown, "No input '" + file_mask + "' files found in directory '" + dir_name + "'");
685     }
686     return num_of_files;
687 }
688 
689 
Run(void)690 int CBiosampleChkApp::Run(void)
691 {
692     const CArgs& args = GetArgs();
693     Setup(args);
694 
695     m_Mode = args["m"].AsInteger();
696     m_FirstSeqOnly = args["M"].AsBoolean();
697     m_IDPrefix = args["R"] ? args["R"].AsString() : "";
698     m_HUPDate = args["HUP"] ? args["HUP"].AsString() : "";
699     m_BioSampleAccession = args["biosample"] ? args["biosample"].AsString() : "";
700     m_BioProjectAccession = args["bioproject"] ? args["bioproject"].AsString() : "";
701     m_Comment = args["comment"] ? args["comment"].AsString() : "";
702 
703     if (m_Mode == e_report_status) {
704         m_Handler = new CBiosampleStatusReport();
705     }
706 
707     if (args["o"]) {
708         if (m_Mode == e_report_diffs || m_Mode == e_generate_biosample
709             //|| m_Mode == e_take_from_biosample
710             || (m_Handler != NULL && m_Handler->NeedsReportStream())) {
711             m_ReportStream = &(args["o"].AsOutputFile());
712             if (!m_ReportStream)
713             {
714                 NCBI_THROW(CException, eUnknown, "Unable to open " + args["o"].AsString());
715             }
716             if (m_Handler) {
717                 m_Handler->SetReportStream(m_ReportStream);
718             }
719             if (m_Mode == e_take_from_biosample) {
720                 m_Table.Reset(new CSeq_table());
721                 m_Table->SetNum_rows(0);
722             }
723         } else {
724             SaveFile(args["o"].AsString(), args["b"]);
725         }
726     } else if (m_Mode == e_update_with || m_Mode == e_update_no) {
727             m_ReportStream = &NcbiCout;
728             if (!m_ReportStream)
729             {
730                 NCBI_THROW(CException, eUnknown, "Unable to open " + args["o"].AsString());
731             }
732             if (m_Handler) {
733                 m_Handler->SetReportStream(m_ReportStream);
734             }
735             if (m_Mode == e_take_from_biosample) {
736                 m_Table.Reset(new CSeq_table());
737                 m_Table->SetNum_rows(0);
738             }
739     }
740 
741     m_LogStream = args["L"] ? &(args["L"].AsOutputFile()) : &NcbiCout;
742     m_StructuredCommentPrefix = args["P"] ? args["P"].AsString() : "";
743     if (!NStr::IsBlank(m_StructuredCommentPrefix) && !NStr::StartsWith(m_StructuredCommentPrefix, "##")) {
744         m_StructuredCommentPrefix = "##" + m_StructuredCommentPrefix;
745     }
746 
747     m_UseDevServer = args["d"].AsBoolean();
748 
749     if (args["authorize"]) {
750         CNcbiIfstream infile(args["authorize"].AsString().c_str());
751         string line;
752         while (NcbiGetlineEOL(infile, line)) {
753             if (m_Username.empty()) {
754                 m_Username = line;
755             } else if (m_Password.empty()) {
756                 m_Password = line;
757             } else {
758                 break;
759             }
760         }
761     } else {
762       m_Username = args["username"] ? args["username"].AsString() : "";
763       m_Password = args["password"] ? args["password"].AsString() : "";
764     }
765     NStr::TruncateSpacesInPlace(m_Username);
766     NStr::TruncateSpacesInPlace(m_Password);
767 
768     if (!NStr::IsBlank(m_StructuredCommentPrefix) && m_Mode != e_generate_biosample) {
769         // error
770         *m_LogStream << "Structured comment prefix is only appropriate for generating a biosample table." << endl;
771         return 1;
772     }
773 
774     if (m_Mode == e_report_diffs) {
775         m_CompareStructuredComments = false;
776     }
777 
778     // Process file based on its content
779     // Unless otherwise specified we assume the file in hand is
780     // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
781     // Release file (batch processing) where we process each Seq-entry
782     // at a time.
783     if (NStr::Equal(args["a"].AsString(), "l")) {
784         m_ListType = e_accessions;
785     } else if (NStr::Equal(args["a"].AsString(), "f")) {
786         m_ListType = e_files;
787     } else {
788         m_ListType = e_none;
789     }
790 
791     string dir_name    = (args["p"]) ? args["p"].AsString() : "";
792     string file_suffix = (args["f"]) ? args["f"].AsString() : "";
793     string file_mask   = (args["x"]) ? args["x"].AsString() : ".sqn";
794     file_mask = "*" + file_mask;
795     bool dir_recurse   = args["u"];
796     if (m_Mode == e_report_status && !NStr::IsBlank(m_BioSampleAccession)) {
797         biosample_util::EStatus status = biosample_util::GetBiosampleStatus(m_BioSampleAccession, m_UseDevServer);
798         if (m_ReportStream) {
799             *m_ReportStream << m_BioSampleAccession << "\t" << biosample_util::GetBiosampleStatusName(status) << endl;
800         } else {
801             NcbiCout << m_BioSampleAccession << "\t" << biosample_util::GetBiosampleStatusName(status) << endl;
802         }
803     } else if ( m_Mode == e_push) {
804         if (m_ListType != e_none) {
805             // error
806             *m_LogStream << "List type (-a l or -a f) is not appropriate for push mode." << endl;
807             return 1;
808         } else if (!args["p"] || !args["i"]) {
809             // error
810             *m_LogStream << "Both directory containing contigs (-p) and master file (-i) are required for push mode." << endl;
811             return 1;
812         } else {
813             m_Descriptors = GetBiosampleDescriptors(args["i"].AsString());
814             ProcessOneDirectory (dir_name, file_suffix, file_mask, dir_recurse);
815         }
816     } else if ( args["p"] ) {
817         ProcessOneDirectory (dir_name, file_suffix, file_mask, dir_recurse);
818         if (m_Mode == e_take_from_biosample) {
819             if (m_Table && m_Table->GetNum_rows() > 0) {
820                 PrintTable(m_Table);
821             }
822         }
823     } else {
824         if (args["i"]) {
825             ProcessOneFile (args["i"].AsString());
826             if (m_Mode == e_take_from_biosample) {
827                 if (m_Table && m_Table->GetNum_rows() > 0) {
828                     PrintTable(m_Table);
829                 }
830             }
831         }
832     }
833 
834     if (m_Unprocessed > 0) {
835         if (m_Mode != e_report_diffs) {
836             *m_LogStream << m_Unprocessed << " results failed" << endl;
837         }
838         return 1;
839     } else {
840         return m_ReturnCode;
841     }
842 }
843 
844 
BuildScope(void)845 CRef<CScope> CBiosampleChkApp::BuildScope (void)
846 {
847     CRef<CScope> scope(new CScope (*m_ObjMgr));
848     scope->AddDefaults();
849 
850     return scope;
851 }
852 
853 
ReadClassMember(CObjectIStream & in,const CObjectInfo::CMemberIterator & member)854 void CBiosampleChkApp::ReadClassMember
855 (CObjectIStream& in,
856  const CObjectInfo::CMemberIterator& member)
857 {
858     m_Level++;
859 
860     if ( m_Level == 1 ) {
861         size_t n = 0;
862         // Read each element separately to a local TSeqEntry,
863         // process it somehow, and... not store it in the container.
864         for ( CIStreamContainerIterator i(in, member); i; ++i ) {
865             try {
866                 // Get seq-entry to process
867                 CRef<CSeq_entry> se(new CSeq_entry);
868                 i >> *se;
869 
870                 CStopWatch sw(CStopWatch::eStart);
871 
872                 m_Diffs.clear();
873                 ProcessSeqEntry(se);
874                 PrintResults(m_Diffs);
875                 // TODO! Must free diffs
876                 m_Diffs.clear();
877 
878                 if (m_ReportStream) {
879                     *m_ReportStream << "Elapsed = " << sw.Elapsed() << endl;
880                 }
881                 n++;
882             } catch (std::exception e) {
883                 if ( !m_Continue ) {
884                     throw;
885                 }
886                 // should we issue some sort of warning?
887             }
888         }
889     } else {
890         in.ReadClassMember(member);
891     }
892 
893     m_Level--;
894 }
895 
896 
ProcessReleaseFile(const CArgs & args)897 void CBiosampleChkApp::ProcessReleaseFile
898 (const CArgs& args)
899 {
900     CRef<CBioseq_set> seqset(new CBioseq_set);
901 
902     // Register the Seq-entry hook
903     CObjectTypeInfo set_type = CType<CBioseq_set>();
904     set_type.FindMember("seq-set").SetLocalReadHook(*m_In, this);
905 
906     // Read the CBioseq_set, it will call the hook object each time we
907     // encounter a Seq-entry
908     *m_In >> *seqset;
909 }
910 
911 
ReadSeqEntry(void)912 CRef<CSeq_entry> CBiosampleChkApp::ReadSeqEntry(void)
913 {
914     CRef<CSeq_entry> se(new CSeq_entry);
915     m_In->Read(ObjectInfo(*se), CObjectIStream::eNoFileHeader);
916 
917     return se;
918 }
919 
920 
ReadBioseqSet(void)921 CRef<CBioseq_set> CBiosampleChkApp::ReadBioseqSet(void)
922 {
923     CRef<CBioseq_set> set(new CBioseq_set());
924     m_In->Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
925 
926     return set;
927 }
928 
929 
PrintTable(CRef<CSeq_table> table)930 void CBiosampleChkApp::PrintTable(CRef<CSeq_table> table)
931 {
932     if (table->GetNum_rows() == 0) {
933         // do nothing
934         return;
935     }
936 
937     for (const auto &it : table->GetColumns()) {
938         *m_ReportStream << it->GetHeader().GetTitle() << "\t";
939     }
940     *m_ReportStream << endl;
941     for (size_t row = 0; row < (size_t)table->GetNum_rows(); row++) {
942         for (const auto &it : table->GetColumns()) {
943             if (row < it->GetData().GetString().size()) {
944                 *m_ReportStream << it->GetData().GetString()[row] << "\t";
945             } else {
946                 *m_ReportStream << "\t";
947             }
948         }
949         *m_ReportStream << endl;
950     }
951 }
952 
953 
PrintDiffs(biosample_util::TBiosampleFieldDiffList & diffs)954 void CBiosampleChkApp::PrintDiffs(biosample_util::TBiosampleFieldDiffList & diffs)
955 {
956     if (diffs.empty()) {
957         if (m_Processed == 0) {
958             *m_ReportStream << "No results processed" << endl;
959         } else {
960             *m_ReportStream << "No differences found" << endl;
961         }
962     } else {
963         if (m_NeedReportHeader) {
964             biosample_util::CBiosampleFieldDiff::PrintHeader(*m_ReportStream, false);
965             m_NeedReportHeader = false;
966         }
967 
968         for (const auto &it : diffs) {
969             it->Print(*m_ReportStream, false);
970         }
971     }
972     if (m_Unprocessed > 0) {
973         *m_ReportStream << m_Unprocessed << " results failed" << endl;
974     }
975 }
976 
977 
PrintResults(biosample_util::TBiosampleFieldDiffList & diffs)978 void CBiosampleChkApp::PrintResults(biosample_util::TBiosampleFieldDiffList & diffs)
979 {
980     PrintDiffs(diffs);
981 }
982 
983 
CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList & diffs,bool del_okay)984 void CBiosampleChkApp::CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList & diffs, bool del_okay)
985 {
986     if (diffs.empty()) {
987         return;
988     }
989 
990     vector< CRef<biosample_util::CBiosampleFieldDiff> > add_item;
991     vector< CRef<biosample_util::CBiosampleFieldDiff> > change_item;
992     vector< CRef<biosample_util::CBiosampleFieldDiff> > delete_item;
993     vector< CRef<biosample_util::CBiosampleFieldDiff> > change_organism;
994 
995     set<string> ids;
996 
997     for (const auto &it : diffs) {
998         string id = it->GetBioSample();
999         string smp = it->GetSampleVal();
1000         string src = it->GetSrcVal();
1001         string fld = it->GetFieldName();
1002         bool blank_smp = NStr::IsBlank(smp);
1003         bool blank_src = NStr::IsBlank(src);
1004         if (blank_smp && blank_src) {
1005             continue;
1006         }
1007         if (smp == src) {
1008             continue;
1009         }
1010         ids.insert(id);
1011         if (fld == "Organism Name") {
1012             change_organism.push_back(it);
1013         } else if (blank_smp) {
1014             add_item.push_back(it);
1015         } else if (blank_src) {
1016             if (del_okay) {
1017                 delete_item.push_back(it);
1018             }
1019         } else {
1020             change_item.push_back(it);
1021         }
1022     }
1023 
1024     CJson_Document req;
1025     CJson_Object top_obj = req.SetObject();
1026     CJson_Array biosample_array = top_obj.insert_array("update");
1027 
1028     CJson_Object options_obj = top_obj.insert_object("options");
1029     options_obj.insert("attribute_synonyms", "true");
1030 
1031     for (auto& id : ids) {
1032         CJson_Object obj1 = biosample_array.push_back_object();
1033         obj1.insert("samples", id);
1034 
1035         if (! add_item.empty()) {
1036             CJson_Object add_obj = obj1.insert_object("add");
1037             CJson_Array add_arr = add_obj.insert_array("attribute");
1038             for (auto& itm : add_item) {
1039                 CJson_Object obj2 = add_arr.push_back_object();
1040                 obj2.insert("name", itm->GetFieldName());
1041                 obj2.insert("new_value", itm->GetSrcVal());
1042             }
1043         }
1044 
1045         if (! delete_item.empty()) {
1046             CJson_Object del_obj = obj1.insert_object("delete");
1047             CJson_Array del_arr = del_obj.insert_array("attribute");
1048             for (auto& itm : delete_item) {
1049                 CJson_Object obj2 = del_arr.push_back_object();
1050                 obj2.insert("name", itm->GetFieldName());
1051                 obj2.insert("old_value", itm->GetSampleVal());
1052             }
1053         }
1054 
1055         if (! change_item.empty() || ! change_organism.empty()) {
1056             CJson_Object chg_obj = obj1.insert_object("change");
1057             if (! change_organism.empty()) {
1058                 CJson_Object chg_org = chg_obj.insert_object("organism");
1059                 for (auto& itm : change_organism) {
1060                     chg_org.insert("new_value", itm->GetSrcVal());
1061                 }
1062             }
1063             if (! change_item.empty()) {
1064                 CJson_Array chg_arr = chg_obj.insert_array("attribute");
1065                 for (auto& itm : change_item) {
1066                     string fld = itm->GetFieldName();
1067                     if (fld == "Tax ID") {
1068                         continue;
1069                     }
1070                     CJson_Object obj2 = chg_arr.push_back_object();
1071                     obj2.insert("name", fld);
1072                     obj2.insert("old_value", itm->GetSampleVal());
1073                     obj2.insert("new_value", itm->GetSrcVal());
1074                 }
1075             }
1076         }
1077     }
1078 
1079     if ( ids.size() > 1 ) {
1080         *m_LogStream << "ERROR: More than one BioSample ID is not supported by -m 7." << endl;
1081         exit(6);
1082     }
1083 
1084     string sData = req.ToString();
1085 
1086     NcbiCout << sData << endl;
1087 
1088     CHttpSession session;
1089 
1090     if (m_Username == "" || m_Password == "") {
1091         *m_LogStream << "ERROR: Username and password are needed with -m 7." << endl;
1092         exit(6);
1093     }
1094 
1095     // MyNCBI signin
1096     string sUrl = "https://www.ncbi.nlm.nih.gov/portal/signin.cgi?js";
1097     CHttpRequest request = session.NewRequest(sUrl, CHttpSession::ePost);
1098     request.SetRetries(0);
1099 
1100     CHttpFormData& data = request.FormData();
1101     data.AddEntry("cmd", "signin");
1102     data.AddEntry("surl", "dummy");
1103     data.AddEntry("furl", "dummy");
1104     data.AddEntry("rrme", "1");
1105     data.AddEntry("uname", m_Username);
1106     data.AddEntry("upasswd", m_Password);
1107 
1108     // get authentication cookie
1109     CHttpResponse response = request.Execute();
1110 
1111     if (response.GetStatusCode() != 200) {
1112         *m_LogStream << "ERROR: Unable to login to MyNCBI." << endl;
1113         exit(6);
1114     }
1115 
1116     // BioSample update
1117     if (m_UseDevServer) {
1118         sUrl = "https://dev-api-int.ncbi.nlm.nih.gov/biosample/update/";
1119     } else {
1120         sUrl = "https://api-int.ncbi.nlm.nih.gov/biosample/update/";
1121     }
1122     string sContentType = "application/json; charset=utf-8";
1123 
1124     CHttpCookie m_cookie;
1125     m_cookie.Reset();
1126 
1127     // Getting cookies - need WebCubbyUser
1128     ITERATE(CHttpCookies, it, session.Cookies())
1129     {
1130         if ( it->GetName() == "WebCubbyUser")
1131         {
1132             m_cookie = *it;
1133             break;
1134         }
1135     }
1136 
1137     // send biosample request
1138     session.Cookies().Add(m_cookie);
1139     response = session.Post(sUrl, sData, sContentType);
1140 
1141     if (response.GetStatusCode() != 200) {
1142         NcbiStreamCopy(cout, response.ErrorStream());
1143         cout << endl;
1144     } else {
1145         NcbiStreamCopy(cout, response.ContentStream());
1146         cout << endl;
1147     }
1148 
1149     // MyNCBI signout
1150     sUrl = "https://www.ncbi.nlm.nih.gov/account/signout/";
1151     session.Get(sUrl);
1152 }
1153 
1154 
GetBioseqDiffs(CBioseq_Handle bh)1155 void CBiosampleChkApp::GetBioseqDiffs(CBioseq_Handle bh)
1156 {
1157     vector<string> unprocessed_ids;
1158     biosample_util::TBiosampleFieldDiffList new_diffs =
1159                   biosample_util::GetBioseqDiffs(bh,
1160                                        m_BioSampleAccession,
1161                                        m_Processed,
1162                                        unprocessed_ids,
1163                                        m_UseDevServer,
1164                                        m_CompareStructuredComments,
1165                                        m_StructuredCommentPrefix,
1166                                        &m_cache);
1167     if (! new_diffs.empty()) {
1168         m_Diffs.insert(m_Diffs.end(), new_diffs.begin(), new_diffs.end());
1169         for (const auto &id : unprocessed_ids) {
1170             *m_LogStream << "Failed to retrieve BioSample data for  " << id << endl;
1171         }
1172         m_Unprocessed += unprocessed_ids.size();
1173     }
1174 }
1175 
1176 
PushToRecord(CBioseq_Handle bh)1177 void CBiosampleChkApp::PushToRecord(CBioseq_Handle bh)
1178 {
1179     for (const auto &it : m_Descriptors) {
1180         if (it->IsSource()) {
1181             UpdateBioSource(bh, it->GetSource());
1182         }
1183     }
1184 }
1185 
1186 
ProcessBioseqForUpdate(CBioseq_Handle bh)1187 void CBiosampleChkApp::ProcessBioseqForUpdate(CBioseq_Handle bh)
1188 {
1189     vector<string> biosample_ids = biosample_util::GetBiosampleIDs(bh);
1190 
1191     if (!biosample_util::ResolveSuppliedBioSampleAccession(m_BioSampleAccession, biosample_ids)) {
1192         // error
1193         string label = biosample_util::GetBestBioseqLabel(bh);
1194         *m_LogStream << label << " has conflicting BioSample Accession " << biosample_ids[0] << endl;
1195         return;
1196     }
1197 
1198     if (biosample_ids.empty()) {
1199         // for report mode, do not report if no biosample ID
1200         return;
1201     }
1202 
1203     for (const auto &id : biosample_ids) {
1204         CRef<CSeq_descr> descr = biosample_util::GetBiosampleData(id, m_UseDevServer, &m_cache);
1205         if (descr) {
1206             m_Descriptors.clear();
1207             copy(descr->Set().begin(), descr->Set().end(),
1208                 back_inserter(m_Descriptors));
1209             PushToRecord(bh);
1210             m_Descriptors.clear();
1211         }
1212     }
1213 
1214 }
1215 
1216 
ProcessBioseqHandle(CBioseq_Handle bh)1217 void CBiosampleChkApp::ProcessBioseqHandle(CBioseq_Handle bh)
1218 {
1219     switch (m_Mode) {
1220         case e_report_diffs:
1221             GetBioseqDiffs(bh);
1222             break;
1223         case e_generate_biosample:
1224             try {
1225                 biosample_util::PrintBioseqXML(
1226                           bh,
1227                           m_IDPrefix,
1228                           m_ReportStream,
1229                           m_BioProjectAccession,
1230                           m_Owner,
1231                           m_HUPDate,
1232                           m_Comment,
1233                           m_FirstSeqOnly,
1234                           m_CompareStructuredComments,
1235                           m_StructuredCommentPrefix);
1236             } catch (CException& e) {
1237                 *m_LogStream << e.GetMsg() << endl;
1238             }
1239             break;
1240         case e_push:
1241             PushToRecord(bh);
1242             break;
1243         case e_take_from_biosample:
1244             m_Diffs.clear();
1245             GetBioseqDiffs(bh);
1246             if (biosample_util::DoDiffsContainConflicts(m_Diffs, m_LogStream)) {
1247                 m_ReturnCode = 1;
1248                 string sequence_id = biosample_util::GetBestBioseqLabel(bh);
1249                 *m_LogStream << "Conflicts found for  " << sequence_id << endl;
1250                 try {
1251                     biosample_util::AddBioseqToTable(
1252                                   bh, *m_Table,
1253                                   true,
1254                                   m_CompareStructuredComments,
1255                                   m_StructuredCommentPrefix);
1256                 } catch (CException& e) {
1257                     *m_LogStream << e.GetMsg() << endl;
1258                 }
1259             } else {
1260                 ProcessBioseqForUpdate(bh);
1261             }
1262             break;
1263         case e_take_from_biosample_force:
1264             ProcessBioseqForUpdate(bh);
1265             break;
1266         case e_update_with:
1267         case e_update_no:
1268             GetBioseqDiffs(bh);
1269             break;
1270         default:
1271             if (m_Handler != NULL) {
1272                 m_Handler->ProcessBioseq(bh);
1273             }
1274             break;
1275     }
1276 
1277 }
1278 
1279 
ProcessSeqEntry(CRef<CSeq_entry> se)1280 void CBiosampleChkApp::ProcessSeqEntry(CRef<CSeq_entry> se)
1281 {
1282     CRef<CScope> scope = BuildScope();
1283     CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*se);
1284     CBioseq_CI bi(seh, CSeq_inst::eMol_na);
1285     while (bi) {
1286         ProcessBioseqHandle(*bi);
1287         if (m_FirstSeqOnly) {
1288             break;
1289         }
1290         ++bi;
1291     }
1292     scope->RemoveTopLevelSeqEntry(seh);
1293 }
1294 
1295 
ProcessSeqEntry(void)1296 void CBiosampleChkApp::ProcessSeqEntry(void)
1297 {
1298     // Get seq-entry to process
1299     CRef<CSeq_entry> se(ReadSeqEntry());
1300 
1301     ProcessSeqEntry(se);
1302 
1303     // write out copy after processing, if requested
1304     if (m_AsnOut) {
1305         *m_AsnOut << *se;
1306     }
1307 }
1308 
1309 
ProcessSet(void)1310 void CBiosampleChkApp::ProcessSet(void)
1311 {
1312     // Get Bioseq-set to process
1313     CRef<CBioseq_set> set(ReadBioseqSet());
1314     if (set && set->IsSetSeq_set()) {
1315         for (const auto &se : set->GetSeq_set()) {
1316             ProcessSeqEntry(se);
1317         }
1318     }
1319 
1320     // write out copy after processing, if requested
1321     if (m_AsnOut) {
1322         *m_AsnOut << *set;
1323     }
1324 }
1325 
1326 
ProcessSeqSubmit(void)1327 void CBiosampleChkApp::ProcessSeqSubmit(void)
1328 {
1329     CRef<CSeq_submit> ss(new CSeq_submit);
1330 
1331     // Get seq-submit to process
1332     m_In->Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
1333 
1334     m_Owner = "";
1335     // get owner from Seq-submit to use if no pub is found
1336     if (ss->IsSetSub()) {
1337         if (ss->GetSub().IsSetCit()
1338             && ss->GetSub().GetCit().IsSetAuthors()
1339             && ss->GetSub().GetCit().GetAuthors().IsSetAffil()) {
1340             m_Owner = biosample_util::OwnerFromAffil(ss->GetSub().GetCit().GetAuthors().GetAffil());
1341         } else if (ss->GetSub().IsSetContact() && ss->GetSub().GetContact().IsSetContact()
1342             && ss->GetSub().GetContact().GetContact().IsSetAffil()) {
1343             m_Owner = biosample_util::OwnerFromAffil(ss->GetSub().GetContact().GetContact().GetAffil());
1344         }
1345     }
1346 
1347     // Process Seq-submit
1348     CRef<CScope> scope = BuildScope();
1349     if (ss->GetData().IsEntrys()) {
1350         for (const auto &se : ss->GetData().GetEntrys()) {
1351             ProcessSeqEntry(se);
1352         }
1353     }
1354     // write out copy after processing, if requested
1355     if (m_AsnOut) {
1356         *m_AsnOut << *ss;
1357     }
1358 }
1359 
s_IsEmptyBioSource(const CSeqdesc & src)1360 static bool s_IsEmptyBioSource(const CSeqdesc& src)
1361 {
1362     return !src.GetSource().IsSetSubtype() && !src.GetSource().IsSetGenome() && !src.GetSource().IsSetOrigin() &&
1363         (!src.GetSource().IsSetOrg() || (!src.GetSource().IsSetOrgname() && !src.GetSource().IsSetTaxname() && !src.GetSource().IsSetDivision()));
1364 }
1365 
UpdateBioSource(CBioseq_Handle bh,const CBioSource & src)1366 void CBiosampleChkApp::UpdateBioSource (CBioseq_Handle bh, const CBioSource& src)
1367 {
1368     CSeqdesc_CI src_desc_ci(bh, CSeqdesc::e_Source);
1369 
1370     CBioseq_EditHandle beh = bh.GetEditHandle();
1371     // Removes empty BioSources
1372     for (; src_desc_ci;) {
1373 
1374         if (s_IsEmptyBioSource(*src_desc_ci)) {
1375             const CSeqdesc& cur_descr = *src_desc_ci;
1376             ++src_desc_ci;
1377             beh.RemoveSeqdesc(cur_descr);
1378         }
1379         else {
1380             break;
1381         }
1382     }
1383 
1384     if (!src_desc_ci) {
1385         CRef<CSeqdesc> new_desc(new CSeqdesc());
1386         new_desc->SetSource().Assign(src);
1387         CBioseq_set_Handle parent = bh.GetParentBioseq_set();
1388 
1389         if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
1390             CBioseq_set_EditHandle bseh = parent.GetEditHandle();
1391             bseh.AddSeqdesc(*new_desc);
1392         } else {
1393             beh.AddSeqdesc(*new_desc);
1394         }
1395     } else {
1396 
1397         const CBioSource& bs = src_desc_ci->GetSource();
1398         CBioSource* old_src = const_cast<CBioSource *> (&bs);
1399         old_src->UpdateWithBioSample(src, true, true);
1400 
1401         // Removes the rest of empty BioSources
1402         for (++src_desc_ci; src_desc_ci;) {
1403 
1404             if (s_IsEmptyBioSource(*src_desc_ci)) {
1405                 const CSeqdesc& cur_descr = *src_desc_ci;
1406                 ++src_desc_ci;
1407                 beh.RemoveSeqdesc(cur_descr);
1408             }
1409             else {
1410                 ++src_desc_ci;
1411             }
1412         }
1413     }
1414 }
1415 
1416 
Setup(const CArgs & args)1417 void CBiosampleChkApp::Setup(const CArgs& args)
1418 {
1419     // Setup application registry and logs for CONNECT library
1420     CORE_SetLOG(LOG_cxx2c());
1421     CORE_SetREG(REG_cxx2c(&GetConfig(), false));
1422     // Setup MT-safety for CONNECT library
1423     // CORE_SetLOCK(MT_LOCK_cxx2c());
1424 
1425     // Create object manager
1426     m_ObjMgr = CObjectManager::GetInstance();
1427 }
1428 
1429 
OpenFile(const CArgs & args)1430 auto_ptr<CObjectIStream> CBiosampleChkApp::OpenFile(const CArgs& args)
1431 {
1432     string fname = args["i"].AsString();
1433     return CBiosampleChkApp::OpenFile(fname);
1434 }
1435 
OpenFile(const string & fname)1436 auto_ptr<CObjectIStream> CBiosampleChkApp::OpenFile(const string &fname)
1437 {
1438     ESerialDataFormat format = eSerial_AsnText;
1439 
1440     auto_ptr<CNcbiIstream> hold_stream(new CNcbiIfstream (fname.c_str(), ios::binary));
1441     CNcbiIstream* InputStream = hold_stream.get();
1442 
1443     CFormatGuess::EFormat formatGuess = CFormatGuess::Format(*InputStream);
1444 
1445     CCompressStream::EMethod method;
1446     switch (formatGuess)
1447     {
1448         case CFormatGuess::eGZip:  method = CCompressStream::eGZipFile;  break;
1449         case CFormatGuess::eBZip2: method = CCompressStream::eBZip2;     break;
1450         case CFormatGuess::eLzo:   method = CCompressStream::eLZO;       break;
1451         default:                   method = CCompressStream::eNone;      break;
1452     }
1453     if (method != CCompressStream::eNone)
1454     {
1455         CDecompressIStream* decompress(new CDecompressIStream(*InputStream, method, CCompressStream::fDefault, eTakeOwnership));
1456         hold_stream.release();
1457         hold_stream.reset(decompress);
1458         InputStream = hold_stream.get();
1459         formatGuess = CFormatGuess::Format(*InputStream);
1460     }
1461 
1462     auto_ptr<CObjectIStream> objectStream;
1463     switch (formatGuess)
1464     {
1465         case CFormatGuess::eBinaryASN:
1466             format = eSerial_AsnBinary;
1467         case CFormatGuess::eTextASN:
1468             format = eSerial_AsnText;
1469             objectStream.reset(CObjectIStream::Open(format, *InputStream, eTakeOwnership));
1470             hold_stream.release();
1471             break;
1472         default:
1473             break;
1474     }
1475     return objectStream;
1476 }
1477 
SaveFile(const string & fname,bool useBinaryOutputFormat)1478 void CBiosampleChkApp::SaveFile(const string &fname, bool useBinaryOutputFormat)
1479 {
1480     ios::openmode mode = ios::out;
1481     m_AsnOut = new CNcbiOfstream(fname.c_str(), mode);
1482     if (!m_AsnOut)
1483     {
1484         NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
1485     }
1486     if ( useBinaryOutputFormat ) {
1487         *m_AsnOut << MSerial_AsnBinary;
1488     } else {
1489         *m_AsnOut << MSerial_AsnText;
1490     }
1491 }
1492 
1493 
1494 /////////////////////////////////////////////////////////////////////////////
1495 //  MAIN
1496 
1497 
main(int argc,const char * argv[])1498 int main(int argc, const char* argv[])
1499 {
1500     return CBiosampleChkApp().AppMain(argc, argv, 0, eDS_Default, 0);
1501 }
1502