1 /* $Id: biosample_chk.cpp 580815 2019-02-21 12:30:55Z choi $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * check biosource and structured comment descriptors against biosample database
30 *
31 */
32
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include <corelib/ncbistre.hpp>
36 #include <corelib/ncbiapp.hpp>
37 #include <corelib/ncbienv.hpp>
38 #include <corelib/ncbiargs.hpp>
39 #include <corelib/ncbiutil.hpp>
40
41 #include <serial/serial.hpp>
42 #include <serial/objistr.hpp>
43 #include <serial/objectio.hpp>
44
45 #include <connect/ncbi_core_cxx.hpp>
46 #include <connect/ncbi_util.h>
47 #include <connect/ncbi_http_session.hpp>
48
49 // Objects includes
50 #include <objects/general/Object_id.hpp>
51 #include <objects/seq/Bioseq.hpp>
52 #include <objects/seqloc/Seq_id.hpp>
53 #include <objects/seqloc/Seq_loc.hpp>
54 #include <objects/seqloc/Seq_interval.hpp>
55 #include <objects/seq/Seq_inst.hpp>
56 #include <objects/seqfeat/BioSource.hpp>
57 #include <objects/seqfeat/SubSource.hpp>
58 #include <objects/seqfeat/Org_ref.hpp>
59 #include <objects/seqfeat/OrgName.hpp>
60 #include <objects/seqfeat/OrgMod.hpp>
61 #include <objects/seqfeat/PCRReactionSet.hpp>
62 #include <objects/seqfeat/PCRReaction.hpp>
63 #include <objects/seqfeat/PCRPrimer.hpp>
64 #include <objects/seqfeat/PCRPrimerSet.hpp>
65 #include <objects/seqfeat/PCRPrimerName.hpp>
66 #include <objects/seqfeat/PCRPrimerSeq.hpp>
67 #include <objects/seq/Pubdesc.hpp>
68 #include <objects/pub/Pub.hpp>
69 #include <objects/pub/Pub_equiv.hpp>
70 #include <objects/biblio/Cit_sub.hpp>
71 #include <objects/biblio/Cit_gen.hpp>
72 #include <objects/biblio/Auth_list.hpp>
73 #include <objects/biblio/Author.hpp>
74 #include <objects/biblio/Affil.hpp>
75 #include <objects/general/Person_id.hpp>
76 #include <objects/general/Name_std.hpp>
77 #include <objects/submit/Seq_submit.hpp>
78 #include <objects/submit/Submit_block.hpp>
79 #include <objects/submit/Contact_info.hpp>
80 #include <objects/seqset/Seq_entry.hpp>
81 #include <objtools/cleanup/cleanup.hpp>
82 #include <objects/seqtable/SeqTable_multi_data.hpp>
83 #include <objects/seqtable/SeqTable_column_info.hpp>
84 #include <util/line_reader.hpp>
85 #include <util/compress/stream_util.hpp>
86 #include <util/format_guess.hpp>
87
88 #include <objects/seqset/Bioseq_set.hpp>
89
90 // Object Manager includes
91 #include <objmgr/object_manager.hpp>
92 #include <objmgr/scope.hpp>
93 #include <objmgr/seq_descr_ci.hpp>
94 #include <objmgr/bioseq_handle.hpp>
95 #include <objmgr/bioseq_ci.hpp>
96 #include <objmgr/seqdesc_ci.hpp>
97
98 #include <objtools/data_loaders/genbank/gbloader.hpp>
99 #ifdef HAVE_NCBI_VDB
100 # include <sra/data_loaders/wgs/wgsloader.hpp>
101 #endif
102 #include <misc/jsonwrapp/jsonwrapp.hpp>
103 #include <misc/xmlwrapp/xmlwrapp.hpp>
104
105
106 #include <misc/biosample_util/biosample_util.hpp>
107 #include <misc/biosample_util/struc_table_column.hpp>
108
109 #include <common/test_assert.h> /* This header must go last */
110
111
112 using namespace ncbi;
113 using namespace objects;
114 using namespace xml;
115
/// Version identifier of the biosample_chk application.
const char* BIOSAMPLE_CHK_APP_VER = "1.0";
117
118 /////////////////////////////////////////////////////////////////////////////
119 //
// BioSample checker application: compares biosource and structured comment
// descriptors against the BioSample database
121 //
122
123
124 class CBiosampleHandler
125 {
126 public:
CBiosampleHandler()127 CBiosampleHandler() :
128 m_ReportStream(0),
129 m_UseDevServer(false),
130 m_Username(""),
131 m_Password("")
132 {}
133
~CBiosampleHandler()134 virtual ~CBiosampleHandler() {}
135
ProcessBioseq(CBioseq_Handle bh)136 virtual void ProcessBioseq(CBioseq_Handle bh) {}
NeedsReportStream()137 virtual bool NeedsReportStream() { return false; }
AddSummary()138 virtual void AddSummary() {}
139
SetReportStream(CNcbiOstream * stream)140 void SetReportStream(CNcbiOstream* stream) { m_ReportStream = stream; }
141
142 protected:
143 CNcbiOstream* m_ReportStream;
144 bool m_UseDevServer;
145 string m_Username;
146 string m_Password;
147 };
148
149
150 class CBiosampleStatusReport : public CBiosampleHandler
151 {
152 public:
CBiosampleStatusReport()153 CBiosampleStatusReport() : CBiosampleHandler() {}
~CBiosampleStatusReport()154 virtual ~CBiosampleStatusReport() {}
155 virtual void ProcessBioseq(CBioseq_Handle bh);
NeedsReportStream()156 virtual bool NeedsReportStream() { return true; }
157 virtual void AddSummary();
158
159 protected:
160 biosample_util::TStatuses m_Status;
161 };
162
163
ProcessBioseq(CBioseq_Handle bsh)164 void CBiosampleStatusReport::ProcessBioseq(CBioseq_Handle bsh)
165 {
166 vector<string> ids = biosample_util::GetBiosampleIDs(bsh);
167 if (ids.empty()) {
168 return;
169 }
170
171 for (const auto &it : ids) {
172 if (m_Status.find(it) == m_Status.end()) {
173 biosample_util::TStatus new_pair(it, biosample_util::eStatus_Unknown);
174 m_Status.insert(new_pair);
175 }
176 }
177 }
178
AddSummary()179 void CBiosampleStatusReport::AddSummary()
180 {
181 if (m_Status.empty()) {
182 *m_ReportStream << "No BioSample IDs found" << endl;
183 } else {
184 biosample_util::GetBiosampleStatus(m_Status, m_UseDevServer);
185 biosample_util::TStatuses::iterator it = m_Status.begin();
186 while (it != m_Status.end()) {
187 *m_ReportStream << it->first << "\t" << biosample_util::GetBiosampleStatusName(it->second) << endl;
188 ++it;
189 }
190 }
191 m_Status.clear();
192 }
193
194
195 class CBiosampleChkApp : public CNcbiApplication, CReadClassMemberHook
196 {
197 public:
198 CBiosampleChkApp(void);
199
200 virtual void Init(void);
201 virtual int Run (void);
202
203 void ReadClassMember(CObjectIStream& in,
204 const CObjectInfo::CMemberIterator& member);
205
206 private:
207
208 void Setup(const CArgs& args);
209
210 auto_ptr<CObjectIStream> OpenFile(const CArgs& args);
211 auto_ptr<CObjectIStream> OpenFile(const string &fname);
212 void SaveFile(const string &fname, bool useBinaryOutputFormat);
213
214 void GetBioseqDiffs(CBioseq_Handle bh);
215 void PushToRecord(CBioseq_Handle bh);
216
217 void ProcessBioseqForUpdate(CBioseq_Handle bh);
218 void ProcessBioseqHandle(CBioseq_Handle bh);
219 void ProcessSeqEntry(CRef<CSeq_entry> se);
220 void ProcessSeqEntry(void);
221 void ProcessSet(void);
222 void ProcessSeqSubmit(void);
223 void ProcessAsnInput (void);
224 void ProcessList (const string& fname);
225 void ProcessFileList (const string& fname);
226 int ProcessOneDirectory(const string& dir_name, const string& file_suffix, const string& file_mask, bool recurse);
227 void ProcessOneFile(string fname);
228 void ProcessReleaseFile(const CArgs& args);
229 CRef<CSeq_entry> ReadSeqEntry(void);
230 CRef<CBioseq_set> ReadBioseqSet(void);
231
232 void CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList& diffs, bool del_okay);
233 void PrintResults(biosample_util::TBiosampleFieldDiffList& diffs);
234 void PrintDiffs(biosample_util::TBiosampleFieldDiffList& diffs);
235 void PrintTable(CRef<CSeq_table> table);
236
237 CRef<CScope> BuildScope(void);
238
239 // for mode 3, biosample_push
240 void UpdateBioSource (CBioseq_Handle bh, const CBioSource& src);
241 vector<CRef<CSeqdesc> > GetBiosampleDescriptors(string fname);
242 vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqSubmit();
243 vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqEntry();
244 vector<CRef<CSeqdesc> > GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry& se);
245
246 CRef<CObjectManager> m_ObjMgr;
247 auto_ptr<CObjectIStream> m_In;
248 bool m_Continue;
249
250 size_t m_Level;
251
252 CNcbiOstream* m_ReportStream;
253 bool m_NeedReportHeader;
254 CNcbiOfstream* m_AsnOut;
255 CNcbiOstream* m_LogStream;
256
257 enum E_Mode {
258 e_report_diffs = 1, // Default - report diffs between biosources on records with biosample accessions
259 // and biosample data
260 e_generate_biosample,
261 e_push,
262 e_take_from_biosample, // update with qualifiers from BioSample, stop if conflict
263 e_take_from_biosample_force, // update with qualifiers from BioSample, no stop on conflict
264 e_report_status, // make table with list of BioSample IDs and statuses
265 e_update_with, // use web API for update (with delete)
266 e_update_no // use web API for update (no delete)
267 };
268
269 enum E_ListType {
270 e_none = 0,
271 e_accessions,
272 e_files
273 };
274
275 int m_Mode;
276 int m_ReturnCode;
277 int m_ListType;
278 string m_StructuredCommentPrefix;
279 bool m_CompareStructuredComments;
280 bool m_UseDevServer;
281 bool m_FirstSeqOnly;
282 string m_Username;
283 string m_Password;
284 string m_IDPrefix;
285 string m_HUPDate;
286 string m_BioSampleAccession;
287 string m_BioProjectAccession;
288 string m_Owner;
289 string m_Comment;
290
291 size_t m_Processed;
292 size_t m_Unprocessed;
293
294 biosample_util::TBiosampleFieldDiffList m_Diffs;
295 CRef<CSeq_table> m_Table;
296 vector<CRef<CSeqdesc> > m_Descriptors;
297
298 CBiosampleHandler * m_Handler;
299
300 biosample_util::TBioSamples m_cache;
301 };
302
303
CBiosampleChkApp(void)304 CBiosampleChkApp::CBiosampleChkApp(void) :
305 m_ObjMgr(0), m_In(0), m_Continue(false),
306 m_Level(0), m_ReportStream(0), m_NeedReportHeader(true), m_AsnOut(0),
307 m_LogStream(0), m_Mode(e_report_diffs), m_ReturnCode(0),
308 m_StructuredCommentPrefix(""), m_CompareStructuredComments(true),
309 m_FirstSeqOnly(false), m_IDPrefix(""), m_HUPDate(""),
310 m_BioSampleAccession(""), m_BioProjectAccession(""),
311 m_Owner(""), m_Comment(""),
312 m_Processed(0), m_Unprocessed(0), m_Handler(NULL)
313 {
314 }
315
316
Init(void)317 void CBiosampleChkApp::Init(void)
318 {
319 // Prepare command line descriptions
320
321 // Create
322 auto_ptr<CArgDescriptions> arg_desc(new CArgDescriptions);
323
324 arg_desc->AddOptionalKey
325 ("p", "Directory", "Path to ASN.1 Files",
326 CArgDescriptions::eInputFile);
327 arg_desc->AddOptionalKey
328 ("i", "InFile", "Single Input File",
329 CArgDescriptions::eInputFile);
330 arg_desc->AddOptionalKey(
331 "o", "OutFile", "Single Output File",
332 CArgDescriptions::eOutputFile);
333 arg_desc->AddOptionalKey(
334 "f", "Filter", "Substring Filter",
335 CArgDescriptions::eOutputFile);
336 arg_desc->AddDefaultKey
337 ("x", "String", "File Selection Substring", CArgDescriptions::eString, ".sqn");
338 arg_desc->AddFlag("u", "Recurse");
339 arg_desc->AddFlag("d", "Use development Biosample server");
340
341 arg_desc->AddDefaultKey("a", "a",
342 "ASN.1 Type (a Automatic, z Any, e Seq-entry, b Bioseq, s Bioseq-set, m Seq-submit, t Batch Bioseq-set, u Batch Seq-submit) or accession list (l)",
343 CArgDescriptions::eString,
344 "a");
345
346 arg_desc->AddFlag("b", "Output binary ASN.1");
347 //arg_desc->AddFlag("c", "Batch File is Compressed");
348 arg_desc->AddFlag("M", "Process only first sequence in file (master)");
349 arg_desc->AddOptionalKey("R", "BioSampleIDPrefix", "BioSample ID Prefix", CArgDescriptions::eString);
350 arg_desc->AddOptionalKey("HUP", "HUPDate", "Hold Until Publish Date", CArgDescriptions::eString);
351
352 arg_desc->AddOptionalKey(
353 "L", "OutFile", "Log File",
354 CArgDescriptions::eOutputFile);
355
356 arg_desc->AddDefaultKey(
357 "m", "mode", "Mode:\n"
358 "\t1 create update file\n"
359 "\t2 generate file for creating new biosample entries\n"
360 "\t3 push source info from one file (-i) to others (-p)\n"
361 "\t4 update with source qualifiers from BioSample unless conflict\n"
362 "\t5 update with source qualifiers from BioSample (continue with conflict))\n"
363 "\t6 report transaction status\n"
364 "\t7 use web API for update (with delete)\n"
365 "\t8 use web API for update (no delete)\n",
366 CArgDescriptions::eInteger, "1");
367 CArgAllow* constraint = new CArgAllow_Integers(e_report_diffs, e_update_no);
368 arg_desc->SetConstraint("m", constraint);
369
370 arg_desc->AddOptionalKey(
371 "P", "Prefix", "StructuredCommentPrefix", CArgDescriptions::eString);
372
373 arg_desc->AddOptionalKey(
374 "biosample", "BioSampleAccession", "BioSample Accession to use for sequences in record. Report error if sequences contain a reference to a different BioSample accession.", CArgDescriptions::eString);
375 arg_desc->AddOptionalKey(
376 "bioproject", "BioProjectAccession", "BioProject Accession to use for sequences in record. Report error if sequences contain a reference to a different BioProject accession.", CArgDescriptions::eString);
377 arg_desc->AddOptionalKey("comment", "BioSampleComment", "Comment to use for creating new BioSample xml", CArgDescriptions::eString);
378
379 arg_desc->AddOptionalKey
380 ("authorize", "AuthorizeFile", "Username and Password File",
381 CArgDescriptions::eInputFile);
382
383 arg_desc->AddOptionalKey("username", "ApiUsername", "Username", CArgDescriptions::eString);
384 arg_desc->AddOptionalKey("password", "ApiPassword", "Password", CArgDescriptions::eString);
385
386 // Program description
387 string prog_description = "BioSample Checker\n";
388 arg_desc->SetUsageContext(GetArguments().GetProgramBasename(),
389 prog_description, false);
390
391 // Pass argument descriptions to the application
392 SetupArgDescriptions(arg_desc.release());
393
394 }
395
396
ProcessAsnInput(void)397 void CBiosampleChkApp::ProcessAsnInput (void)
398 {
399 // Process file based on its content
400 // Unless otherwise specifien we assume the file in hand is
401 // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
402 // Release file (batch processing) where we process each Seq-entry
403 // at a time.
404 string header = m_In->ReadFileHeader();
405
406 bool unhandled = false;
407 try {
408 if (header == "Seq-submit" ) { // Seq-submit
409 ProcessSeqSubmit();
410 } else if ( header == "Seq-entry" ) { // Seq-entry
411 ProcessSeqEntry();
412 } else if (header == "Bioseq-set" ) { // Bioseq-set
413 ProcessSet();
414 } else {
415 unhandled = true;
416 }
417 } catch (CException& e) {
418 if (NStr::StartsWith(e.GetMsg(), "duplicate Bioseq id")) {
419 *m_LogStream << e.GetMsg();
420 exit(4);
421 } else {
422 throw e;
423 }
424 }
425 if (unhandled) {
426 NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
427 }
428
429 }
430
431
ProcessList(const string & fname)432 void CBiosampleChkApp::ProcessList (const string& fname)
433 {
434 // Process file with list of accessions
435
436 CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
437 CGBDataLoader::RegisterInObjectManager(*objmgr);
438 #ifdef HAVE_NCBI_VDB
439 CWGSDataLoader::RegisterInObjectManager(*objmgr);
440 #endif
441 CScope scope(*objmgr);
442 scope.AddDefaults();
443
444 CRef<ILineReader> lr = ILineReader::New (fname);
445 while ( !lr->AtEOF() ) {
446 CTempString line = *++*lr;
447 if (!NStr::IsBlank(line)) {
448 try {
449 CRef<CSeq_id> id(new CSeq_id(line));
450 if (id) {
451 CBioseq_Handle bsh = scope.GetBioseqHandle(*id);
452 if (bsh) {
453 ProcessBioseqHandle(bsh);
454 } else {
455 *m_LogStream << "Unable to fetch Bioseq for " << line << endl;
456 string label = "";
457 id->GetLabel(&label);
458 *m_LogStream << " (interpreted as " << label << ")" << endl;
459 m_Unprocessed++;
460 }
461 }
462 } catch (CException& e) {
463 *m_LogStream << e.GetMsg() << endl;
464 m_Unprocessed++;
465 }
466 }
467 }
468
469 }
470
471
ProcessFileList(const string & fname)472 void CBiosampleChkApp::ProcessFileList (const string& fname)
473 {
474 // Process file with list of files
475
476 CRef<CObjectManager> objmgr = CObjectManager::GetInstance();
477 CGBDataLoader::RegisterInObjectManager(*objmgr);
478 CScope scope(*objmgr);
479 scope.AddDefaults();
480
481 m_ListType = e_none;
482 CRef<ILineReader> lr = ILineReader::New (fname);
483 while ( !lr->AtEOF() ) {
484 CTempString line = *++*lr;
485 if (!NStr::IsBlank(line)) {
486 ProcessOneFile(line);
487 }
488 }
489 m_ListType = e_files;
490 }
491
492
ProcessOneFile(string fname)493 void CBiosampleChkApp::ProcessOneFile(string fname)
494 {
495 const CArgs& args = GetArgs();
496
497 bool need_to_close_report = false;
498 bool need_to_close_asn = false;
499
500 if (!m_ReportStream &&
501 (m_Mode == e_report_diffs || m_Mode == e_update_with || m_Mode == e_update_no || m_Mode == e_take_from_biosample || m_Mode == e_report_status ||
502 (m_Handler != NULL && m_Handler->NeedsReportStream()))) {
503 string path = fname;
504 size_t pos = NStr::Find(path, ".", NStr::eCase, NStr::eReverseSearch);
505 if (pos != string::npos) {
506 path = path.substr(0, pos);
507 }
508 path = path + ".val";
509 m_Table.Reset(new CSeq_table());
510 m_Table->SetNum_rows(0);
511 m_ReportStream = new CNcbiOfstream(path.c_str());
512 if (!m_ReportStream)
513 {
514 NCBI_THROW(CException, eUnknown, "Unable to open " + path);
515 }
516 need_to_close_report = true;
517 m_NeedReportHeader = true;
518 if (m_Handler && m_Handler->NeedsReportStream()) {
519 m_Handler->SetReportStream(m_ReportStream);
520 }
521 }
522 if (!m_AsnOut && (m_Mode == e_push || m_Mode == e_take_from_biosample || m_Mode == e_take_from_biosample_force)) {
523 string path = fname;
524 size_t pos = NStr::Find(path, ".", NStr::eCase, NStr::eReverseSearch);
525 if (pos != string::npos) {
526 path = path.substr(0, pos);
527 }
528 path = path + ".out";
529 SaveFile(path, args["b"]);
530 need_to_close_asn = true;
531 }
532
533 m_Diffs.clear();
534 switch (m_ListType) {
535 case e_accessions:
536 ProcessList (fname);
537 break;
538 case e_files:
539 ProcessFileList (fname);
540 break;
541 case e_none:
542 m_In = OpenFile(fname);
543 if (m_In.get() == nullptr) {
544 NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
545 }
546 if (!m_In->InGoodState()) {
547 NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
548 }
549 ProcessAsnInput();
550 break;
551 }
552
553 if (m_Mode == e_report_diffs) {
554 PrintResults(m_Diffs);
555 }
556 if (m_Mode == e_update_with) {
557 CreateBiosampleUpdateWebService(m_Diffs, true);
558 } else if (m_Mode == e_update_no) {
559 CreateBiosampleUpdateWebService(m_Diffs, false);
560 }
561 if (m_Handler != NULL) {
562 m_Handler->AddSummary();
563 }
564
565 // TODO! Must free diffs
566 m_Diffs.clear();
567
568 if (need_to_close_report) {
569 if (m_Mode == e_take_from_biosample) {
570 PrintTable(m_Table);
571 m_Table->Reset();
572 m_Table = new CSeq_table();
573 m_Table->SetNum_rows(0);
574 }
575 m_ReportStream->flush();
576 m_ReportStream = 0;
577 }
578 if (need_to_close_asn) {
579 m_AsnOut->flush();
580 m_AsnOut->close();
581 m_AsnOut = 0;
582 }
583 }
584
585
GetBiosampleDescriptorsFromSeqEntry(void)586 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptorsFromSeqEntry(void)
587 {
588 // Get seq-entry to process
589 CRef<CSeq_entry> se(ReadSeqEntry());
590
591 return GetBiosampleDescriptorsFromSeqEntry(*se);
592 }
593
594
GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry & se)595 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptorsFromSeqEntry(const CSeq_entry& se)
596 {
597 vector<CRef<CSeqdesc> > descriptors;
598
599 CRef<CScope> scope = BuildScope();
600 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(se);
601 CBioseq_CI bi(seh, CSeq_inst::eMol_na);
602 if (bi) {
603 CSeqdesc_CI src_desc_ci(*bi, CSeqdesc::e_Source);
604 if (src_desc_ci) {
605 CRef<CSeqdesc> src_desc(new CSeqdesc());
606 src_desc->Assign(*src_desc_ci);
607 descriptors.push_back(src_desc);
608 }
609 }
610
611 return descriptors;
612 }
613
614
GetBiosampleDescriptorsFromSeqSubmit()615 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptorsFromSeqSubmit()
616 {
617 vector<CRef<CSeqdesc> > descriptors;
618 CRef<CSeq_submit> ss(new CSeq_submit);
619
620 // Get seq-submit to process
621 m_In->Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
622
623 // Validae Seq-submit
624 CRef<CScope> scope = BuildScope();
625 if (ss->GetData().IsEntrys() && ! ss->GetData().GetEntrys().empty()) {
626 descriptors = GetBiosampleDescriptorsFromSeqEntry(**(ss->GetData().GetEntrys().begin()));
627 }
628 return descriptors;
629 }
630
631
GetBiosampleDescriptors(string fname)632 vector<CRef<CSeqdesc> > CBiosampleChkApp::GetBiosampleDescriptors(string fname)
633 {
634 m_In = OpenFile(fname);
635
636 // Process file based on its content
637 // Unless otherwise specifien we assume the file in hand is
638 // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
639 // Release file (batch processing) where we process each Seq-entry
640 // at a time.
641
642 string header = m_In->ReadFileHeader();
643
644 vector<CRef<CSeqdesc> > descriptors;
645 if (header == "Seq-submit" ) { // Seq-submit
646 descriptors = GetBiosampleDescriptorsFromSeqSubmit();
647 } else if ( header == "Seq-entry" ) { // Seq-entry
648 descriptors = GetBiosampleDescriptorsFromSeqEntry();
649
650 } else {
651 NCBI_THROW(CException, eUnknown, "Unhandled type " + header);
652 }
653 return descriptors;
654 }
655
656
ProcessOneDirectory(const string & dir_name,const string & file_suffix,const string & file_mask,bool recurse)657 int CBiosampleChkApp::ProcessOneDirectory(const string& dir_name, const string& file_suffix, const string& file_mask, bool recurse)
658 {
659 int num_of_files = 0;
660
661 CDir dir(dir_name);
662 CDir::TEntries files (dir.GetEntries(file_mask, CDir::eFile));
663 for (const auto &ii : files) {
664 string fname = ii->GetName();
665 if (ii->IsFile() &&
666 (!file_suffix.empty() || NStr::Find (fname, file_suffix) != string::npos)) {
667 ++num_of_files;
668 string fname = CDirEntry::MakePath(dir_name, ii->GetName());
669 ProcessOneFile (fname);
670 }
671 }
672 if (recurse) {
673 CDir::TEntries subdirs (dir.GetEntries("", CDir::eDir));
674 for (const auto &ii : subdirs) {
675 string subdir = ii->GetName();
676 if (ii->IsDir() && !NStr::Equal(subdir, ".") && !NStr::Equal(subdir, "..")) {
677 string subname = CDirEntry::MakePath(dir_name, ii->GetName());
678 num_of_files += ProcessOneDirectory (subname, file_suffix, file_mask, recurse);
679 }
680 }
681 }
682 if (!num_of_files)
683 {
684 NCBI_THROW(CException, eUnknown, "No input '" + file_mask + "' files found in directory '" + dir_name + "'");
685 }
686 return num_of_files;
687 }
688
689
Run(void)690 int CBiosampleChkApp::Run(void)
691 {
692 const CArgs& args = GetArgs();
693 Setup(args);
694
695 m_Mode = args["m"].AsInteger();
696 m_FirstSeqOnly = args["M"].AsBoolean();
697 m_IDPrefix = args["R"] ? args["R"].AsString() : "";
698 m_HUPDate = args["HUP"] ? args["HUP"].AsString() : "";
699 m_BioSampleAccession = args["biosample"] ? args["biosample"].AsString() : "";
700 m_BioProjectAccession = args["bioproject"] ? args["bioproject"].AsString() : "";
701 m_Comment = args["comment"] ? args["comment"].AsString() : "";
702
703 if (m_Mode == e_report_status) {
704 m_Handler = new CBiosampleStatusReport();
705 }
706
707 if (args["o"]) {
708 if (m_Mode == e_report_diffs || m_Mode == e_generate_biosample
709 //|| m_Mode == e_take_from_biosample
710 || (m_Handler != NULL && m_Handler->NeedsReportStream())) {
711 m_ReportStream = &(args["o"].AsOutputFile());
712 if (!m_ReportStream)
713 {
714 NCBI_THROW(CException, eUnknown, "Unable to open " + args["o"].AsString());
715 }
716 if (m_Handler) {
717 m_Handler->SetReportStream(m_ReportStream);
718 }
719 if (m_Mode == e_take_from_biosample) {
720 m_Table.Reset(new CSeq_table());
721 m_Table->SetNum_rows(0);
722 }
723 } else {
724 SaveFile(args["o"].AsString(), args["b"]);
725 }
726 } else if (m_Mode == e_update_with || m_Mode == e_update_no) {
727 m_ReportStream = &NcbiCout;
728 if (!m_ReportStream)
729 {
730 NCBI_THROW(CException, eUnknown, "Unable to open " + args["o"].AsString());
731 }
732 if (m_Handler) {
733 m_Handler->SetReportStream(m_ReportStream);
734 }
735 if (m_Mode == e_take_from_biosample) {
736 m_Table.Reset(new CSeq_table());
737 m_Table->SetNum_rows(0);
738 }
739 }
740
741 m_LogStream = args["L"] ? &(args["L"].AsOutputFile()) : &NcbiCout;
742 m_StructuredCommentPrefix = args["P"] ? args["P"].AsString() : "";
743 if (!NStr::IsBlank(m_StructuredCommentPrefix) && !NStr::StartsWith(m_StructuredCommentPrefix, "##")) {
744 m_StructuredCommentPrefix = "##" + m_StructuredCommentPrefix;
745 }
746
747 m_UseDevServer = args["d"].AsBoolean();
748
749 if (args["authorize"]) {
750 CNcbiIfstream infile(args["authorize"].AsString().c_str());
751 string line;
752 while (NcbiGetlineEOL(infile, line)) {
753 if (m_Username.empty()) {
754 m_Username = line;
755 } else if (m_Password.empty()) {
756 m_Password = line;
757 } else {
758 break;
759 }
760 }
761 } else {
762 m_Username = args["username"] ? args["username"].AsString() : "";
763 m_Password = args["password"] ? args["password"].AsString() : "";
764 }
765 NStr::TruncateSpacesInPlace(m_Username);
766 NStr::TruncateSpacesInPlace(m_Password);
767
768 if (!NStr::IsBlank(m_StructuredCommentPrefix) && m_Mode != e_generate_biosample) {
769 // error
770 *m_LogStream << "Structured comment prefix is only appropriate for generating a biosample table." << endl;
771 return 1;
772 }
773
774 if (m_Mode == e_report_diffs) {
775 m_CompareStructuredComments = false;
776 }
777
778 // Process file based on its content
779 // Unless otherwise specified we assume the file in hand is
780 // a Seq-entry ASN.1 file, other option are a Seq-submit or NCBI
781 // Release file (batch processing) where we process each Seq-entry
782 // at a time.
783 if (NStr::Equal(args["a"].AsString(), "l")) {
784 m_ListType = e_accessions;
785 } else if (NStr::Equal(args["a"].AsString(), "f")) {
786 m_ListType = e_files;
787 } else {
788 m_ListType = e_none;
789 }
790
791 string dir_name = (args["p"]) ? args["p"].AsString() : "";
792 string file_suffix = (args["f"]) ? args["f"].AsString() : "";
793 string file_mask = (args["x"]) ? args["x"].AsString() : ".sqn";
794 file_mask = "*" + file_mask;
795 bool dir_recurse = args["u"];
796 if (m_Mode == e_report_status && !NStr::IsBlank(m_BioSampleAccession)) {
797 biosample_util::EStatus status = biosample_util::GetBiosampleStatus(m_BioSampleAccession, m_UseDevServer);
798 if (m_ReportStream) {
799 *m_ReportStream << m_BioSampleAccession << "\t" << biosample_util::GetBiosampleStatusName(status) << endl;
800 } else {
801 NcbiCout << m_BioSampleAccession << "\t" << biosample_util::GetBiosampleStatusName(status) << endl;
802 }
803 } else if ( m_Mode == e_push) {
804 if (m_ListType != e_none) {
805 // error
806 *m_LogStream << "List type (-a l or -a f) is not appropriate for push mode." << endl;
807 return 1;
808 } else if (!args["p"] || !args["i"]) {
809 // error
810 *m_LogStream << "Both directory containing contigs (-p) and master file (-i) are required for push mode." << endl;
811 return 1;
812 } else {
813 m_Descriptors = GetBiosampleDescriptors(args["i"].AsString());
814 ProcessOneDirectory (dir_name, file_suffix, file_mask, dir_recurse);
815 }
816 } else if ( args["p"] ) {
817 ProcessOneDirectory (dir_name, file_suffix, file_mask, dir_recurse);
818 if (m_Mode == e_take_from_biosample) {
819 if (m_Table && m_Table->GetNum_rows() > 0) {
820 PrintTable(m_Table);
821 }
822 }
823 } else {
824 if (args["i"]) {
825 ProcessOneFile (args["i"].AsString());
826 if (m_Mode == e_take_from_biosample) {
827 if (m_Table && m_Table->GetNum_rows() > 0) {
828 PrintTable(m_Table);
829 }
830 }
831 }
832 }
833
834 if (m_Unprocessed > 0) {
835 if (m_Mode != e_report_diffs) {
836 *m_LogStream << m_Unprocessed << " results failed" << endl;
837 }
838 return 1;
839 } else {
840 return m_ReturnCode;
841 }
842 }
843
844
BuildScope(void)845 CRef<CScope> CBiosampleChkApp::BuildScope (void)
846 {
847 CRef<CScope> scope(new CScope (*m_ObjMgr));
848 scope->AddDefaults();
849
850 return scope;
851 }
852
853
ReadClassMember(CObjectIStream & in,const CObjectInfo::CMemberIterator & member)854 void CBiosampleChkApp::ReadClassMember
855 (CObjectIStream& in,
856 const CObjectInfo::CMemberIterator& member)
857 {
858 m_Level++;
859
860 if ( m_Level == 1 ) {
861 size_t n = 0;
862 // Read each element separately to a local TSeqEntry,
863 // process it somehow, and... not store it in the container.
864 for ( CIStreamContainerIterator i(in, member); i; ++i ) {
865 try {
866 // Get seq-entry to process
867 CRef<CSeq_entry> se(new CSeq_entry);
868 i >> *se;
869
870 CStopWatch sw(CStopWatch::eStart);
871
872 m_Diffs.clear();
873 ProcessSeqEntry(se);
874 PrintResults(m_Diffs);
875 // TODO! Must free diffs
876 m_Diffs.clear();
877
878 if (m_ReportStream) {
879 *m_ReportStream << "Elapsed = " << sw.Elapsed() << endl;
880 }
881 n++;
882 } catch (std::exception e) {
883 if ( !m_Continue ) {
884 throw;
885 }
886 // should we issue some sort of warning?
887 }
888 }
889 } else {
890 in.ReadClassMember(member);
891 }
892
893 m_Level--;
894 }
895
896
ProcessReleaseFile(const CArgs & args)897 void CBiosampleChkApp::ProcessReleaseFile
898 (const CArgs& args)
899 {
900 CRef<CBioseq_set> seqset(new CBioseq_set);
901
902 // Register the Seq-entry hook
903 CObjectTypeInfo set_type = CType<CBioseq_set>();
904 set_type.FindMember("seq-set").SetLocalReadHook(*m_In, this);
905
906 // Read the CBioseq_set, it will call the hook object each time we
907 // encounter a Seq-entry
908 *m_In >> *seqset;
909 }
910
911
ReadSeqEntry(void)912 CRef<CSeq_entry> CBiosampleChkApp::ReadSeqEntry(void)
913 {
914 CRef<CSeq_entry> se(new CSeq_entry);
915 m_In->Read(ObjectInfo(*se), CObjectIStream::eNoFileHeader);
916
917 return se;
918 }
919
920
ReadBioseqSet(void)921 CRef<CBioseq_set> CBiosampleChkApp::ReadBioseqSet(void)
922 {
923 CRef<CBioseq_set> set(new CBioseq_set());
924 m_In->Read(ObjectInfo(*set), CObjectIStream::eNoFileHeader);
925
926 return set;
927 }
928
929
PrintTable(CRef<CSeq_table> table)930 void CBiosampleChkApp::PrintTable(CRef<CSeq_table> table)
931 {
932 if (table->GetNum_rows() == 0) {
933 // do nothing
934 return;
935 }
936
937 for (const auto &it : table->GetColumns()) {
938 *m_ReportStream << it->GetHeader().GetTitle() << "\t";
939 }
940 *m_ReportStream << endl;
941 for (size_t row = 0; row < (size_t)table->GetNum_rows(); row++) {
942 for (const auto &it : table->GetColumns()) {
943 if (row < it->GetData().GetString().size()) {
944 *m_ReportStream << it->GetData().GetString()[row] << "\t";
945 } else {
946 *m_ReportStream << "\t";
947 }
948 }
949 *m_ReportStream << endl;
950 }
951 }
952
953
PrintDiffs(biosample_util::TBiosampleFieldDiffList & diffs)954 void CBiosampleChkApp::PrintDiffs(biosample_util::TBiosampleFieldDiffList & diffs)
955 {
956 if (diffs.empty()) {
957 if (m_Processed == 0) {
958 *m_ReportStream << "No results processed" << endl;
959 } else {
960 *m_ReportStream << "No differences found" << endl;
961 }
962 } else {
963 if (m_NeedReportHeader) {
964 biosample_util::CBiosampleFieldDiff::PrintHeader(*m_ReportStream, false);
965 m_NeedReportHeader = false;
966 }
967
968 for (const auto &it : diffs) {
969 it->Print(*m_ReportStream, false);
970 }
971 }
972 if (m_Unprocessed > 0) {
973 *m_ReportStream << m_Unprocessed << " results failed" << endl;
974 }
975 }
976
977
PrintResults(biosample_util::TBiosampleFieldDiffList & diffs)978 void CBiosampleChkApp::PrintResults(biosample_util::TBiosampleFieldDiffList & diffs)
979 {
980 PrintDiffs(diffs);
981 }
982
983
CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList & diffs,bool del_okay)984 void CBiosampleChkApp::CreateBiosampleUpdateWebService(biosample_util::TBiosampleFieldDiffList & diffs, bool del_okay)
985 {
986 if (diffs.empty()) {
987 return;
988 }
989
990 vector< CRef<biosample_util::CBiosampleFieldDiff> > add_item;
991 vector< CRef<biosample_util::CBiosampleFieldDiff> > change_item;
992 vector< CRef<biosample_util::CBiosampleFieldDiff> > delete_item;
993 vector< CRef<biosample_util::CBiosampleFieldDiff> > change_organism;
994
995 set<string> ids;
996
997 for (const auto &it : diffs) {
998 string id = it->GetBioSample();
999 string smp = it->GetSampleVal();
1000 string src = it->GetSrcVal();
1001 string fld = it->GetFieldName();
1002 bool blank_smp = NStr::IsBlank(smp);
1003 bool blank_src = NStr::IsBlank(src);
1004 if (blank_smp && blank_src) {
1005 continue;
1006 }
1007 if (smp == src) {
1008 continue;
1009 }
1010 ids.insert(id);
1011 if (fld == "Organism Name") {
1012 change_organism.push_back(it);
1013 } else if (blank_smp) {
1014 add_item.push_back(it);
1015 } else if (blank_src) {
1016 if (del_okay) {
1017 delete_item.push_back(it);
1018 }
1019 } else {
1020 change_item.push_back(it);
1021 }
1022 }
1023
1024 CJson_Document req;
1025 CJson_Object top_obj = req.SetObject();
1026 CJson_Array biosample_array = top_obj.insert_array("update");
1027
1028 CJson_Object options_obj = top_obj.insert_object("options");
1029 options_obj.insert("attribute_synonyms", "true");
1030
1031 for (auto& id : ids) {
1032 CJson_Object obj1 = biosample_array.push_back_object();
1033 obj1.insert("samples", id);
1034
1035 if (! add_item.empty()) {
1036 CJson_Object add_obj = obj1.insert_object("add");
1037 CJson_Array add_arr = add_obj.insert_array("attribute");
1038 for (auto& itm : add_item) {
1039 CJson_Object obj2 = add_arr.push_back_object();
1040 obj2.insert("name", itm->GetFieldName());
1041 obj2.insert("new_value", itm->GetSrcVal());
1042 }
1043 }
1044
1045 if (! delete_item.empty()) {
1046 CJson_Object del_obj = obj1.insert_object("delete");
1047 CJson_Array del_arr = del_obj.insert_array("attribute");
1048 for (auto& itm : delete_item) {
1049 CJson_Object obj2 = del_arr.push_back_object();
1050 obj2.insert("name", itm->GetFieldName());
1051 obj2.insert("old_value", itm->GetSampleVal());
1052 }
1053 }
1054
1055 if (! change_item.empty() || ! change_organism.empty()) {
1056 CJson_Object chg_obj = obj1.insert_object("change");
1057 if (! change_organism.empty()) {
1058 CJson_Object chg_org = chg_obj.insert_object("organism");
1059 for (auto& itm : change_organism) {
1060 chg_org.insert("new_value", itm->GetSrcVal());
1061 }
1062 }
1063 if (! change_item.empty()) {
1064 CJson_Array chg_arr = chg_obj.insert_array("attribute");
1065 for (auto& itm : change_item) {
1066 string fld = itm->GetFieldName();
1067 if (fld == "Tax ID") {
1068 continue;
1069 }
1070 CJson_Object obj2 = chg_arr.push_back_object();
1071 obj2.insert("name", fld);
1072 obj2.insert("old_value", itm->GetSampleVal());
1073 obj2.insert("new_value", itm->GetSrcVal());
1074 }
1075 }
1076 }
1077 }
1078
1079 if ( ids.size() > 1 ) {
1080 *m_LogStream << "ERROR: More than one BioSample ID is not supported by -m 7." << endl;
1081 exit(6);
1082 }
1083
1084 string sData = req.ToString();
1085
1086 NcbiCout << sData << endl;
1087
1088 CHttpSession session;
1089
1090 if (m_Username == "" || m_Password == "") {
1091 *m_LogStream << "ERROR: Username and password are needed with -m 7." << endl;
1092 exit(6);
1093 }
1094
1095 // MyNCBI signin
1096 string sUrl = "https://www.ncbi.nlm.nih.gov/portal/signin.cgi?js";
1097 CHttpRequest request = session.NewRequest(sUrl, CHttpSession::ePost);
1098 request.SetRetries(0);
1099
1100 CHttpFormData& data = request.FormData();
1101 data.AddEntry("cmd", "signin");
1102 data.AddEntry("surl", "dummy");
1103 data.AddEntry("furl", "dummy");
1104 data.AddEntry("rrme", "1");
1105 data.AddEntry("uname", m_Username);
1106 data.AddEntry("upasswd", m_Password);
1107
1108 // get authentication cookie
1109 CHttpResponse response = request.Execute();
1110
1111 if (response.GetStatusCode() != 200) {
1112 *m_LogStream << "ERROR: Unable to login to MyNCBI." << endl;
1113 exit(6);
1114 }
1115
1116 // BioSample update
1117 if (m_UseDevServer) {
1118 sUrl = "https://dev-api-int.ncbi.nlm.nih.gov/biosample/update/";
1119 } else {
1120 sUrl = "https://api-int.ncbi.nlm.nih.gov/biosample/update/";
1121 }
1122 string sContentType = "application/json; charset=utf-8";
1123
1124 CHttpCookie m_cookie;
1125 m_cookie.Reset();
1126
1127 // Getting cookies - need WebCubbyUser
1128 ITERATE(CHttpCookies, it, session.Cookies())
1129 {
1130 if ( it->GetName() == "WebCubbyUser")
1131 {
1132 m_cookie = *it;
1133 break;
1134 }
1135 }
1136
1137 // send biosample request
1138 session.Cookies().Add(m_cookie);
1139 response = session.Post(sUrl, sData, sContentType);
1140
1141 if (response.GetStatusCode() != 200) {
1142 NcbiStreamCopy(cout, response.ErrorStream());
1143 cout << endl;
1144 } else {
1145 NcbiStreamCopy(cout, response.ContentStream());
1146 cout << endl;
1147 }
1148
1149 // MyNCBI signout
1150 sUrl = "https://www.ncbi.nlm.nih.gov/account/signout/";
1151 session.Get(sUrl);
1152 }
1153
1154
GetBioseqDiffs(CBioseq_Handle bh)1155 void CBiosampleChkApp::GetBioseqDiffs(CBioseq_Handle bh)
1156 {
1157 vector<string> unprocessed_ids;
1158 biosample_util::TBiosampleFieldDiffList new_diffs =
1159 biosample_util::GetBioseqDiffs(bh,
1160 m_BioSampleAccession,
1161 m_Processed,
1162 unprocessed_ids,
1163 m_UseDevServer,
1164 m_CompareStructuredComments,
1165 m_StructuredCommentPrefix,
1166 &m_cache);
1167 if (! new_diffs.empty()) {
1168 m_Diffs.insert(m_Diffs.end(), new_diffs.begin(), new_diffs.end());
1169 for (const auto &id : unprocessed_ids) {
1170 *m_LogStream << "Failed to retrieve BioSample data for " << id << endl;
1171 }
1172 m_Unprocessed += unprocessed_ids.size();
1173 }
1174 }
1175
1176
PushToRecord(CBioseq_Handle bh)1177 void CBiosampleChkApp::PushToRecord(CBioseq_Handle bh)
1178 {
1179 for (const auto &it : m_Descriptors) {
1180 if (it->IsSource()) {
1181 UpdateBioSource(bh, it->GetSource());
1182 }
1183 }
1184 }
1185
1186
ProcessBioseqForUpdate(CBioseq_Handle bh)1187 void CBiosampleChkApp::ProcessBioseqForUpdate(CBioseq_Handle bh)
1188 {
1189 vector<string> biosample_ids = biosample_util::GetBiosampleIDs(bh);
1190
1191 if (!biosample_util::ResolveSuppliedBioSampleAccession(m_BioSampleAccession, biosample_ids)) {
1192 // error
1193 string label = biosample_util::GetBestBioseqLabel(bh);
1194 *m_LogStream << label << " has conflicting BioSample Accession " << biosample_ids[0] << endl;
1195 return;
1196 }
1197
1198 if (biosample_ids.empty()) {
1199 // for report mode, do not report if no biosample ID
1200 return;
1201 }
1202
1203 for (const auto &id : biosample_ids) {
1204 CRef<CSeq_descr> descr = biosample_util::GetBiosampleData(id, m_UseDevServer, &m_cache);
1205 if (descr) {
1206 m_Descriptors.clear();
1207 copy(descr->Set().begin(), descr->Set().end(),
1208 back_inserter(m_Descriptors));
1209 PushToRecord(bh);
1210 m_Descriptors.clear();
1211 }
1212 }
1213
1214 }
1215
1216
ProcessBioseqHandle(CBioseq_Handle bh)1217 void CBiosampleChkApp::ProcessBioseqHandle(CBioseq_Handle bh)
1218 {
1219 switch (m_Mode) {
1220 case e_report_diffs:
1221 GetBioseqDiffs(bh);
1222 break;
1223 case e_generate_biosample:
1224 try {
1225 biosample_util::PrintBioseqXML(
1226 bh,
1227 m_IDPrefix,
1228 m_ReportStream,
1229 m_BioProjectAccession,
1230 m_Owner,
1231 m_HUPDate,
1232 m_Comment,
1233 m_FirstSeqOnly,
1234 m_CompareStructuredComments,
1235 m_StructuredCommentPrefix);
1236 } catch (CException& e) {
1237 *m_LogStream << e.GetMsg() << endl;
1238 }
1239 break;
1240 case e_push:
1241 PushToRecord(bh);
1242 break;
1243 case e_take_from_biosample:
1244 m_Diffs.clear();
1245 GetBioseqDiffs(bh);
1246 if (biosample_util::DoDiffsContainConflicts(m_Diffs, m_LogStream)) {
1247 m_ReturnCode = 1;
1248 string sequence_id = biosample_util::GetBestBioseqLabel(bh);
1249 *m_LogStream << "Conflicts found for " << sequence_id << endl;
1250 try {
1251 biosample_util::AddBioseqToTable(
1252 bh, *m_Table,
1253 true,
1254 m_CompareStructuredComments,
1255 m_StructuredCommentPrefix);
1256 } catch (CException& e) {
1257 *m_LogStream << e.GetMsg() << endl;
1258 }
1259 } else {
1260 ProcessBioseqForUpdate(bh);
1261 }
1262 break;
1263 case e_take_from_biosample_force:
1264 ProcessBioseqForUpdate(bh);
1265 break;
1266 case e_update_with:
1267 case e_update_no:
1268 GetBioseqDiffs(bh);
1269 break;
1270 default:
1271 if (m_Handler != NULL) {
1272 m_Handler->ProcessBioseq(bh);
1273 }
1274 break;
1275 }
1276
1277 }
1278
1279
ProcessSeqEntry(CRef<CSeq_entry> se)1280 void CBiosampleChkApp::ProcessSeqEntry(CRef<CSeq_entry> se)
1281 {
1282 CRef<CScope> scope = BuildScope();
1283 CSeq_entry_Handle seh = scope->AddTopLevelSeqEntry(*se);
1284 CBioseq_CI bi(seh, CSeq_inst::eMol_na);
1285 while (bi) {
1286 ProcessBioseqHandle(*bi);
1287 if (m_FirstSeqOnly) {
1288 break;
1289 }
1290 ++bi;
1291 }
1292 scope->RemoveTopLevelSeqEntry(seh);
1293 }
1294
1295
ProcessSeqEntry(void)1296 void CBiosampleChkApp::ProcessSeqEntry(void)
1297 {
1298 // Get seq-entry to process
1299 CRef<CSeq_entry> se(ReadSeqEntry());
1300
1301 ProcessSeqEntry(se);
1302
1303 // write out copy after processing, if requested
1304 if (m_AsnOut) {
1305 *m_AsnOut << *se;
1306 }
1307 }
1308
1309
ProcessSet(void)1310 void CBiosampleChkApp::ProcessSet(void)
1311 {
1312 // Get Bioseq-set to process
1313 CRef<CBioseq_set> set(ReadBioseqSet());
1314 if (set && set->IsSetSeq_set()) {
1315 for (const auto &se : set->GetSeq_set()) {
1316 ProcessSeqEntry(se);
1317 }
1318 }
1319
1320 // write out copy after processing, if requested
1321 if (m_AsnOut) {
1322 *m_AsnOut << *set;
1323 }
1324 }
1325
1326
ProcessSeqSubmit(void)1327 void CBiosampleChkApp::ProcessSeqSubmit(void)
1328 {
1329 CRef<CSeq_submit> ss(new CSeq_submit);
1330
1331 // Get seq-submit to process
1332 m_In->Read(ObjectInfo(*ss), CObjectIStream::eNoFileHeader);
1333
1334 m_Owner = "";
1335 // get owner from Seq-submit to use if no pub is found
1336 if (ss->IsSetSub()) {
1337 if (ss->GetSub().IsSetCit()
1338 && ss->GetSub().GetCit().IsSetAuthors()
1339 && ss->GetSub().GetCit().GetAuthors().IsSetAffil()) {
1340 m_Owner = biosample_util::OwnerFromAffil(ss->GetSub().GetCit().GetAuthors().GetAffil());
1341 } else if (ss->GetSub().IsSetContact() && ss->GetSub().GetContact().IsSetContact()
1342 && ss->GetSub().GetContact().GetContact().IsSetAffil()) {
1343 m_Owner = biosample_util::OwnerFromAffil(ss->GetSub().GetContact().GetContact().GetAffil());
1344 }
1345 }
1346
1347 // Process Seq-submit
1348 CRef<CScope> scope = BuildScope();
1349 if (ss->GetData().IsEntrys()) {
1350 for (const auto &se : ss->GetData().GetEntrys()) {
1351 ProcessSeqEntry(se);
1352 }
1353 }
1354 // write out copy after processing, if requested
1355 if (m_AsnOut) {
1356 *m_AsnOut << *ss;
1357 }
1358 }
1359
s_IsEmptyBioSource(const CSeqdesc & src)1360 static bool s_IsEmptyBioSource(const CSeqdesc& src)
1361 {
1362 return !src.GetSource().IsSetSubtype() && !src.GetSource().IsSetGenome() && !src.GetSource().IsSetOrigin() &&
1363 (!src.GetSource().IsSetOrg() || (!src.GetSource().IsSetOrgname() && !src.GetSource().IsSetTaxname() && !src.GetSource().IsSetDivision()));
1364 }
1365
UpdateBioSource(CBioseq_Handle bh,const CBioSource & src)1366 void CBiosampleChkApp::UpdateBioSource (CBioseq_Handle bh, const CBioSource& src)
1367 {
1368 CSeqdesc_CI src_desc_ci(bh, CSeqdesc::e_Source);
1369
1370 CBioseq_EditHandle beh = bh.GetEditHandle();
1371 // Removes empty BioSources
1372 for (; src_desc_ci;) {
1373
1374 if (s_IsEmptyBioSource(*src_desc_ci)) {
1375 const CSeqdesc& cur_descr = *src_desc_ci;
1376 ++src_desc_ci;
1377 beh.RemoveSeqdesc(cur_descr);
1378 }
1379 else {
1380 break;
1381 }
1382 }
1383
1384 if (!src_desc_ci) {
1385 CRef<CSeqdesc> new_desc(new CSeqdesc());
1386 new_desc->SetSource().Assign(src);
1387 CBioseq_set_Handle parent = bh.GetParentBioseq_set();
1388
1389 if (parent && parent.IsSetClass() && parent.GetClass() == CBioseq_set::eClass_nuc_prot) {
1390 CBioseq_set_EditHandle bseh = parent.GetEditHandle();
1391 bseh.AddSeqdesc(*new_desc);
1392 } else {
1393 beh.AddSeqdesc(*new_desc);
1394 }
1395 } else {
1396
1397 const CBioSource& bs = src_desc_ci->GetSource();
1398 CBioSource* old_src = const_cast<CBioSource *> (&bs);
1399 old_src->UpdateWithBioSample(src, true, true);
1400
1401 // Removes the rest of empty BioSources
1402 for (++src_desc_ci; src_desc_ci;) {
1403
1404 if (s_IsEmptyBioSource(*src_desc_ci)) {
1405 const CSeqdesc& cur_descr = *src_desc_ci;
1406 ++src_desc_ci;
1407 beh.RemoveSeqdesc(cur_descr);
1408 }
1409 else {
1410 ++src_desc_ci;
1411 }
1412 }
1413 }
1414 }
1415
1416
Setup(const CArgs & args)1417 void CBiosampleChkApp::Setup(const CArgs& args)
1418 {
1419 // Setup application registry and logs for CONNECT library
1420 CORE_SetLOG(LOG_cxx2c());
1421 CORE_SetREG(REG_cxx2c(&GetConfig(), false));
1422 // Setup MT-safety for CONNECT library
1423 // CORE_SetLOCK(MT_LOCK_cxx2c());
1424
1425 // Create object manager
1426 m_ObjMgr = CObjectManager::GetInstance();
1427 }
1428
1429
OpenFile(const CArgs & args)1430 auto_ptr<CObjectIStream> CBiosampleChkApp::OpenFile(const CArgs& args)
1431 {
1432 string fname = args["i"].AsString();
1433 return CBiosampleChkApp::OpenFile(fname);
1434 }
1435
OpenFile(const string & fname)1436 auto_ptr<CObjectIStream> CBiosampleChkApp::OpenFile(const string &fname)
1437 {
1438 ESerialDataFormat format = eSerial_AsnText;
1439
1440 auto_ptr<CNcbiIstream> hold_stream(new CNcbiIfstream (fname.c_str(), ios::binary));
1441 CNcbiIstream* InputStream = hold_stream.get();
1442
1443 CFormatGuess::EFormat formatGuess = CFormatGuess::Format(*InputStream);
1444
1445 CCompressStream::EMethod method;
1446 switch (formatGuess)
1447 {
1448 case CFormatGuess::eGZip: method = CCompressStream::eGZipFile; break;
1449 case CFormatGuess::eBZip2: method = CCompressStream::eBZip2; break;
1450 case CFormatGuess::eLzo: method = CCompressStream::eLZO; break;
1451 default: method = CCompressStream::eNone; break;
1452 }
1453 if (method != CCompressStream::eNone)
1454 {
1455 CDecompressIStream* decompress(new CDecompressIStream(*InputStream, method, CCompressStream::fDefault, eTakeOwnership));
1456 hold_stream.release();
1457 hold_stream.reset(decompress);
1458 InputStream = hold_stream.get();
1459 formatGuess = CFormatGuess::Format(*InputStream);
1460 }
1461
1462 auto_ptr<CObjectIStream> objectStream;
1463 switch (formatGuess)
1464 {
1465 case CFormatGuess::eBinaryASN:
1466 format = eSerial_AsnBinary;
1467 case CFormatGuess::eTextASN:
1468 format = eSerial_AsnText;
1469 objectStream.reset(CObjectIStream::Open(format, *InputStream, eTakeOwnership));
1470 hold_stream.release();
1471 break;
1472 default:
1473 break;
1474 }
1475 return objectStream;
1476 }
1477
SaveFile(const string & fname,bool useBinaryOutputFormat)1478 void CBiosampleChkApp::SaveFile(const string &fname, bool useBinaryOutputFormat)
1479 {
1480 ios::openmode mode = ios::out;
1481 m_AsnOut = new CNcbiOfstream(fname.c_str(), mode);
1482 if (!m_AsnOut)
1483 {
1484 NCBI_THROW(CException, eUnknown, "Unable to open " + fname);
1485 }
1486 if ( useBinaryOutputFormat ) {
1487 *m_AsnOut << MSerial_AsnBinary;
1488 } else {
1489 *m_AsnOut << MSerial_AsnText;
1490 }
1491 }
1492
1493
1494 /////////////////////////////////////////////////////////////////////////////
1495 // MAIN
1496
1497
main(int argc,const char * argv[])1498 int main(int argc, const char* argv[])
1499 {
1500 return CBiosampleChkApp().AppMain(argc, argv, 0, eDS_Default, 0);
1501 }
1502