1 /* ftanet.cpp
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: ftanet.cpp
28 *
29 * Author: Sergey Bazhin
30 *
31 * File Description:
32 * -----------------
33 * Functions for real working with the servers and network.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39
40 #include <objtools/data_loaders/genbank/gbloader.hpp>
41
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/taxon1/taxon1.hpp>
44 #include <objects/seqset/Bioseq_set.hpp>
45 #include <objects/seq/Seq_descr.hpp>
46 #include <objects/seq/Pubdesc.hpp>
47 #include <objects/pub/Pub_equiv.hpp>
48 #include <objects/pub/Pub.hpp>
49 #include <objects/biblio/Cit_gen.hpp>
50 #include <objects/biblio/Cit_art.hpp>
51 #include <objects/biblio/Cit_sub.hpp>
52 #include <objects/biblio/Cit_book.hpp>
53 #include <objects/biblio/Cit_let.hpp>
54 #include <objects/biblio/Cit_pat.hpp>
55 #include <objects/biblio/Cit_jour.hpp>
56 #include <objects/biblio/Auth_list.hpp>
57 #include <objects/biblio/Affil.hpp>
58 #include <objects/biblio/Author.hpp>
59 #include <objects/biblio/Imprint.hpp>
60 #include <objects/seq/Seq_annot.hpp>
61 #include <objects/pub/Pub_set.hpp>
62 #include <objects/biblio/ArticleIdSet.hpp>
63 #include <objects/biblio/ArticleId.hpp>
64 #include <objmgr/util/sequence.hpp>
65 #include <objtools/edit/pub_fix.hpp>
66
67 #include <dbapi/driver/drivers.hpp>
68
69 #include "index.h"
70
71 #include <objtools/flatfile/flatdefn.h>
72 #include <objtools/flatfile/flatfile_parser.hpp>
73 #include <corelib/ncbi_message.hpp>
74
75 #include "ftaerr.hpp"
76 #include "asci_blk.h"
77 #include "ftamed.h"
78 #include "utilfun.h"
79 #include "ref.h"
80 #include "flatfile_message_reporter.hpp"
81
82 #ifdef THIS_FILE
83 # undef THIS_FILE
84 #endif
85 #define THIS_FILE "ftanet.cpp"
86
87 #define HEALTHY_ACC "U12345"
88
89 BEGIN_NCBI_SCOPE
90 USING_SCOPE(objects);
91
92 static KwordBlk PubStatus[] = {
93 {"Publication Status: Available-Online prior to print", 51},
94 {"Publication Status : Available-Online prior to print", 52},
95 {"Publication_Status: Available-Online prior to print", 51},
96 {"Publication_Status : Available-Online prior to print", 52},
97 {"Publication-Status: Available-Online prior to print", 51},
98 {"Publication-Status : Available-Online prior to print", 52},
99 {"Publication Status: Online-Only", 31},
100 {"Publication Status : Online-Only", 32},
101 {"Publication_Status: Online-Only", 31},
102 {"Publication_Status : Online-Only", 32},
103 {"Publication-Status: Online-Only", 31},
104 {"Publication-Status : Online-Only", 32},
105 {"Publication Status: Available-Online", 36},
106 {"Publication Status : Available-Online", 37},
107 {"Publication_Status: Available-Online", 36},
108 {"Publication_Status : Available-Online", 37},
109 {"Publication-Status: Available-Online", 36},
110 {"Publication-Status : Available-Online", 37},
111 {NULL, 0}
112 };
113
114 /**********************************************************/
fta_strip_pub_comment(char * comment,KwordBlkPtr kbp)115 static char* fta_strip_pub_comment(char* comment, KwordBlkPtr kbp)
116 {
117 char* p;
118 char* q;
119
120 ShrinkSpaces(comment);
121 for(; kbp->str != NULL; kbp++)
122 {
123 for(;;)
124 {
125 p = StringIStr(comment, kbp->str);
126 if(p == NULL)
127 break;
128 for(q = p + kbp->len; *q == ' ' || *q == ';';)
129 q++;
130 fta_StringCpy(p, q);
131 }
132 }
133
134 ShrinkSpaces(comment);
135 p = (*comment == '\0') ? NULL : StringSave(comment);
136 MemFree(comment);
137
138 if(p != NULL && (StringNICmp(p, "Publication Status", 18) == 0 ||
139 StringNICmp(p, "Publication_Status", 18) == 0 ||
140 StringNICmp(p, "Publication-Status", 18) == 0))
141 ErrPostEx(SEV_WARNING, ERR_REFERENCE_UnusualPubStatus,
142 "An unusual Publication Status comment exists for this record: \"%s\". If it is a new variant of the special comments used to indicate ahead-of-print or online-only articles, then the comment must be added to the appropriate table of the parser.",
143 p);
144
145 return(p);
146 }
147
148 /**********************************************************/
fta_fix_last_initials(objects::CName_std & namestd,bool initials)149 static void fta_fix_last_initials(objects::CName_std &namestd,
150 bool initials)
151 {
152 char *str;
153 char *p;
154
155 if(initials)
156 {
157 if(!namestd.IsSetInitials())
158 return;
159 str = (char *) namestd.GetInitials().c_str();
160 }
161 else
162 {
163 if(!namestd.IsSetLast())
164 return;
165 str = (char *) namestd.GetLast().c_str();
166 }
167
168 size_t i = strlen(str);
169 if(i > 5)
170 {
171 p = &str[i-5];
172 if((*p == ' ' || *p == '.') && !strcmp(p + 1, "III."))
173 {
174 namestd.SetSuffix("III");
175 if(*p == '.')
176 p++;
177 *p = '\0';
178 if(initials)
179 namestd.SetInitials(str);
180 else
181 namestd.SetLast(str);
182 i = 0;
183 }
184 }
185 if(i > 4)
186 {
187 p = &str[i-4];
188 if((*p == ' ' || *p == '.') &&
189 (!strcmp(p + 1, "III") || !strcmp(p + 1, "2nd") ||
190 !strcmp(p + 1, "Jr.") || !strcmp(p + 1, "IV.")))
191 {
192 if(!strcmp(p + 1, "III"))
193 namestd.SetSuffix("III");
194 else if(!strcmp(p + 1, "2nd"))
195 namestd.SetSuffix("II");
196 else if(!strcmp(p + 1, "Jr."))
197 namestd.SetSuffix("Jr.");
198 else
199 namestd.SetSuffix("IV");
200 if(*p == '.')
201 p++;
202 *p = '\0';
203 if(initials)
204 namestd.SetInitials(str);
205 else
206 namestd.SetLast(str);
207 i = 0;
208 }
209 }
210 if(i > 3)
211 {
212 p = &str[i-3];
213 if((*p == ' ' || *p == '.') &&
214 (!strcmp(p + 1, "Jr") || !strcmp(p + 1, "IV") ||
215 !strcmp(p + 1, "II")))
216 {
217 if(!strcmp(p + 1, "Jr"))
218 namestd.SetSuffix("Jr.");
219 else if(!strcmp(p + 1, "IV"))
220 namestd.SetSuffix("IV");
221 else
222 namestd.SetSuffix("II");
223 if(*p == '.')
224 p++;
225 *p = '\0';
226 if(initials)
227 namestd.SetInitials(str);
228 else
229 namestd.SetLast(str);
230 i = 0;
231 }
232 }
233 }
234
235 /**********************************************************/
fta_fix_affil(TPubList & pub_list,Parser::ESource source)236 static void fta_fix_affil(TPubList &pub_list, Parser::ESource source)
237 {
238 bool got_pmid = false;
239
240 NON_CONST_ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
241 {
242 if(!(*pub)->IsPmid())
243 continue;
244 got_pmid = true;
245 break;
246 }
247
248 NON_CONST_ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
249 {
250 objects::CAuth_list *authors;
251 if((*pub)->IsArticle())
252 {
253 objects::CCit_art &art = (*pub)->SetArticle();
254 if(!art.IsSetAuthors() || !art.CanGetAuthors())
255 continue;
256
257 authors = &art.SetAuthors();
258 }
259 else if((*pub)->IsSub())
260 {
261 objects::CCit_sub &sub = (*pub)->SetSub();
262 if(!sub.IsSetAuthors() || !sub.CanGetAuthors())
263 continue;
264
265 authors = &sub.SetAuthors();
266 }
267 else if((*pub)->IsGen())
268 {
269 objects::CCit_gen &gen = (*pub)->SetGen();
270 if(!gen.IsSetAuthors() || !gen.CanGetAuthors())
271 continue;
272
273 authors = &gen.SetAuthors();
274 }
275 else if((*pub)->IsBook())
276 {
277 objects::CCit_book &book = (*pub)->SetBook();
278 if(!book.IsSetAuthors() || !book.CanGetAuthors())
279 continue;
280
281 authors = &book.SetAuthors();
282 }
283 else if((*pub)->IsMan())
284 {
285 objects::CCit_let &man = (*pub)->SetMan();
286 if(!man.IsSetCit() || !man.CanGetCit())
287 continue;
288
289 objects::CCit_book &book = man.SetCit();
290 if(!book.IsSetAuthors() || !book.CanGetAuthors())
291 continue;
292
293 authors = &book.SetAuthors();
294 }
295 else if((*pub)->IsPatent())
296 {
297 objects::CCit_pat &pat = (*pub)->SetPatent();
298 if(!pat.IsSetAuthors() || !pat.CanGetAuthors())
299 continue;
300
301 authors = &pat.SetAuthors();
302 }
303 else
304 continue;
305
306
307 if(authors->IsSetAffil() && authors->CanGetAffil() &&
308 authors->GetAffil().Which() == objects::CAffil::e_Str)
309 {
310 objects::CAffil &affil = authors->SetAffil();
311 char *aff = (char *) affil.GetStr().c_str();
312 ShrinkSpaces(aff);
313 affil.SetStr(aff);
314 }
315
316 if(authors->IsSetNames() && authors->CanGetNames() &&
317 authors->GetNames().Which() == objects::CAuth_list::TNames::e_Std)
318 {
319 objects::CAuth_list::TNames &names = authors->SetNames();
320 objects::CAuth_list::TNames::TStd::iterator it = (names.SetStd()).begin();
321 objects::CAuth_list::TNames::TStd::iterator it_end = (names.SetStd()).end();
322 for(; it != it_end; it++)
323 {
324 if((*it)->IsSetAffil() && (*it)->CanGetAffil() &&
325 (*it)->GetAffil().Which() == objects::CAffil::e_Str)
326 {
327 objects::CAffil &affil = (*it)->SetAffil();
328 char *aff = (char *) affil.GetStr().c_str();
329 ShrinkSpaces(aff);
330 affil.SetStr(aff);
331 }
332 if((*it)->IsSetName() && (*it)->CanGetName() &&
333 (*it)->GetName().IsName())
334 {
335 objects::CName_std &namestd = (*it)->SetName().SetName();
336 /* bsv: commented out single letter first name population*/
337 if(source != Parser::ESource::SPROT && source != Parser::ESource::PIR &&
338 !got_pmid)
339 {
340 if(!namestd.IsSetFirst() && namestd.IsSetInitials())
341 {
342 char *str = (char *) namestd.GetInitials().c_str();
343 if((strlen(str) == 1 || strlen(str) == 2) &&
344 (str[1] == '.' || str[1] == '\0'))
345 {
346 char *p = (char*) MemNew(2);
347 p[0] = str[0];
348 p[1] = '\0';
349 namestd.SetFirst(p);
350 MemFree(p);
351 }
352 }
353 if((*pub)->IsArticle())
354 {
355 objects::CCit_art &art1 = (*pub)->SetArticle();
356 if(art1.IsSetAuthors() && art1.CanGetAuthors())
357 {
358 objects::CAuth_list *authors1;
359 authors1 = &art1.SetAuthors();
360 if(authors1->IsSetNames() &&
361 authors1->CanGetNames() &&
362 authors1->GetNames().Which() == objects::CAuth_list::TNames::e_Std)
363 {
364 objects::CAuth_list::TNames &names1 = authors1->SetNames();
365 objects::CAuth_list::TNames::TStd::iterator it1 = (names1.SetStd()).begin();
366 objects::CAuth_list::TNames::TStd::iterator it1_end = (names1.SetStd()).end();
367 for(; it1 != it1_end; it1++)
368 {
369 if((*it1)->IsSetName() &&
370 (*it1)->CanGetName() &&
371 (*it1)->GetName().IsName())
372 {
373 objects::CName_std &namestd1 = (*it1)->SetName().SetName();
374 if(!namestd1.IsSetFirst() &&
375 namestd1.IsSetInitials())
376 {
377 char *str = (char *) namestd1.GetInitials().c_str();
378 if((strlen(str) == 1 || strlen(str) == 2) &&
379 (str[1] == '.' || str[1] == '\0'))
380 {
381 char *p = (char*) MemNew(2);
382 p[0] = str[0];
383 p[1] = '\0';
384 namestd1.SetFirst(p);
385 MemFree(p);
386 }
387 }
388 }
389 }
390 }
391 }
392 }
393 }
394 /**/
395
396 if(namestd.IsSetSuffix())
397 continue;
398 fta_fix_last_initials(namestd, true);
399 if(!namestd.IsSetSuffix())
400 fta_fix_last_initials(namestd, false);
401 }
402 }
403 }
404 }
405 }
406
407 /**********************************************************/
fta_fix_imprint_language(TPubList & pub_list)408 static void fta_fix_imprint_language(TPubList &pub_list)
409 {
410 NON_CONST_ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
411 {
412 if(!(*pub)->IsArticle())
413 continue;
414
415 objects::CCit_art &art = (*pub)->SetArticle();
416 if(!art.IsSetFrom() || !art.GetFrom().IsJournal())
417 continue;
418
419 objects::CCit_jour &journal = art.SetFrom().SetJournal();
420
421 if(journal.IsSetImp() && journal.GetImp().IsSetLanguage())
422 {
423 string language = journal.GetImp().GetLanguage();
424 char *p;
425 char *lang = (char *) language.c_str();
426 for(p = lang; *p != '\0'; p++)
427 if(*p >= 'A' && *p <= 'Z')
428 *p |= 040;
429 journal.SetImp().SetLanguage(lang);
430 }
431 }
432 }
433
434 /**********************************************************/
fta_strip_er_remarks(objects::CPubdesc & pub_descr)435 static void fta_strip_er_remarks(objects::CPubdesc& pub_descr)
436 {
437 if (!pub_descr.IsSetComment())
438 return;
439
440 ITERATE(objects::CPub_equiv::Tdata, pub, pub_descr.GetPub().Get())
441 {
442 if (!(*pub)->IsArticle())
443 continue;
444
445 const objects::CCit_art& art = (*pub)->GetArticle();
446 if (!art.IsSetFrom() || !art.GetFrom().IsJournal())
447 continue;
448
449 const objects::CCit_jour& journal = art.GetFrom().GetJournal();
450
451 int status = 0;
452 if (journal.IsSetImp() && journal.GetImp().IsSetPubstatus())
453 status = journal.GetImp().GetPubstatus();
454
455 if (status == 3 || /* epublish */
456 status == 4 || /* ppublish */
457 status == 10) /* aheadofprint */
458 {
459 char* comment = StringSave(pub_descr.GetComment().c_str());
460 comment = fta_strip_pub_comment(comment, PubStatus);
461 if (comment != NULL && comment[0] != 0)
462 pub_descr.SetComment(comment);
463 else
464 pub_descr.ResetComment();
465
466 MemFree(comment);
467 }
468 }
469 }
470
471 /**********************************************************/
fta_init_med_server(void)472 static Uint1 fta_init_med_server(void)
473 {
474 if(!MedArchInit())
475 return(2);
476 return(1);
477
478 }
479
480 /**********************************************************/
fta_init_tax_server(void)481 static Uint1 fta_init_tax_server(void)
482 {
483 objects::CTaxon1 taxon_srv;
484 if (!taxon_srv.Init())
485 return(2);
486 return(1);
487 }
488
489 /**********************************************************/
fta_init_servers(ParserPtr pp)490 void fta_init_servers(ParserPtr pp)
491 {
492 if(pp->taxserver != 0)
493 {
494 pp->taxserver = fta_init_tax_server();
495 if(pp->taxserver == 2)
496 {
497 ErrPostEx(SEV_WARNING, ERR_SERVER_Failed,
498 "TaxArchInit call failed.");
499 }
500 }
501 else
502 {
503 ErrPostEx(SEV_WARNING, ERR_SERVER_NoTaxLookup,
504 "No taxonomy lookup will be performed.");
505 }
506
507 if(pp->medserver != 0)
508 {
509 pp->medserver = fta_init_med_server();
510 if(pp->medserver == 2)
511 {
512 ErrPostEx(SEV_ERROR, ERR_SERVER_Failed,
513 "MedArchInit call failed.");
514 }
515 }
516 else
517 {
518 ErrPostEx(SEV_WARNING, ERR_SERVER_NoPubMedLookup,
519 "No medline lookup will be performed.");
520 }
521 }
522
523 /**********************************************************/
fta_fini_servers(ParserPtr pp)524 void fta_fini_servers(ParserPtr pp)
525 {
526 if (pp->medserver == 1)
527 MedArchFini();
528 /* if(pp->taxserver == 1)
529 tax1_fini();*/
530 }
531
532 #if 0 // RW-707
533 //std::shared_ptr<CPubseqAccess> s_pubseq;
534
535 /**********************************************************/
536 static Uint1 fta_init_pubseq(void)
537 {
538 // C Toolkit's accpubseq.h library gets username/password from
539 // the environment.
540 // We are now using C++ Toolkit's cpubseq.hpp library which require
541 // credentials during the construction of CPubseqAccess. So read
542 // the environment here and pass it along to the constructor.
543
544 DBAPI_RegisterDriver_FTDS();
545 // DBAPI_RegisterDriver_CTLIB();
546
547 char* env_val = getenv("ALTER_OPEN_SERVER");
548 string idserver = env_val ? env_val : "";
549
550 env_val = getenv("ALTER_USER_NAME");
551 string idusername = env_val ? env_val : "";
552
553 env_val = getenv("ALTER_USER_PASSWORD");
554 string idpassword = env_val ? env_val : "";
555
556 s_pubseq.reset(new CPubseqAccess(idserver.empty() ? "PUBSEQ_OS_INTERNAL_GI64" : idserver.c_str(),
557 idusername.empty() ? "anyone" : idusername.c_str(),
558 idpassword.empty() ? "allowed" : idpassword.c_str()));
559
560 if (s_pubseq == nullptr || !s_pubseq->CheckConnection())
561 return(2);
562 return(1);
563 return 2;
564 }
565
566 /**********************************************************/
567 void fta_entrez_fetch_enable(ParserPtr pp)
568 {
569 return; // RW-707
570
571 if(pp->entrez_fetch != 0)
572 {
573 pp->entrez_fetch = fta_init_pubseq();
574 if(pp->entrez_fetch == 2)
575 {
576 ErrPostEx(SEV_WARNING, ERR_SERVER_Failed,
577 "Failed to connect to PUBSEQ OS.");
578 }
579 }
580 else
581 {
582 ErrPostEx(SEV_WARNING, ERR_SERVER_NotUsed,
583 "No PUBSEQ Bioseq fetch will be performed.");
584 }
585 }
586
587 /**********************************************************/
588 void fta_entrez_fetch_disable(ParserPtr pp)
589 {
590 if(pp->entrez_fetch == 1)
591 s_pubseq.reset();
592 }
593 #endif
594
595 /**********************************************************/
fta_fill_find_pub_option(ParserPtr pp,bool htag,bool rtag)596 void fta_fill_find_pub_option(ParserPtr pp, bool htag, bool rtag)
597 {
598 pp->fpo.always_look = !htag;
599 pp->fpo.replace_cit = !rtag;
600 pp->fpo.merge_ids = true;
601 }
602
603
604 class CFindPub {
605
606 public:
CFindPub(Parser * pp)607 CFindPub(Parser* pp) :
608 m_pParser(pp),
609 m_pPubFixListener(new CPubFixMessageListener()) {
610 if (m_pParser) {
611 const auto& findPubOptions = m_pParser->fpo;
612 m_pPubFix.reset(new edit::CPubFix(
613 findPubOptions.always_look,
614 findPubOptions.replace_cit,
615 findPubOptions.merge_ids,
616 m_pPubFixListener.get()));
617 }
618 }
619
620 using TEntryList = list<CRef<CSeq_entry>>;
621 void Apply(TEntryList& entries);
622 private:
623 void fix_pub_equiv(CPub_equiv& pub_equiv, Parser* pp, bool er);
624 void fix_pub_annot(CPub& pub, Parser* pp, bool er);
625 void find_pub(Parser* pp, list<CRef<CSeq_annot>>& annots, CSeq_descr& descrs);
626
627 Parser* m_pParser;
628 unique_ptr<CPubFixMessageListener> m_pPubFixListener;
629 unique_ptr<edit::CPubFix> m_pPubFix = nullptr;
630 };
631
632
633
634 /**********************************************************/
fta_check_pub_ids(TPubList & pub_list)635 static void fta_check_pub_ids(TPubList& pub_list)
636 {
637 bool found = false;
638 ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
639 {
640 if ((*pub)->IsArticle())
641 {
642 found = true;
643 break;
644 }
645 }
646
647 if (found)
648 return;
649
650 for (objects::CPub_equiv::Tdata::iterator pub = pub_list.begin(); pub != pub_list.end();)
651 {
652 if (!(*pub)->IsMuid() && !(*pub)->IsPmid())
653 {
654 ++pub;
655 continue;
656 }
657
658 ErrPostEx(SEV_ERROR, ERR_REFERENCE_ArticleIdDiscarded,
659 "Article identifier was found for an unpublished, direct submission, book or unparsable article reference, and has been discarded : %s %d.",
660 (*pub)->IsMuid() ? "MUID" : "PMID", (*pub)->GetMuid());
661
662 pub = pub_list.erase(pub);
663 }
664 }
665
666
667 /**********************************************************/
fix_pub_equiv(CPub_equiv & pub_equiv,ParserPtr pp,bool er)668 void CFindPub::fix_pub_equiv(CPub_equiv& pub_equiv, ParserPtr pp, bool er)
669 {
670 if (!pp)
671 return;
672
673 IndexblkPtr ibp = pp->entrylist[pp->curindx];
674
675
676 list<CRef<CPub>> cit_arts;
677 for (auto pPub : pub_equiv.Set())
678 {
679 if (!pPub->IsGen()) {
680 continue;
681 }
682 const CCit_gen& cit_gen = pPub->SetGen();
683 if (cit_gen.IsSetCit() &&
684 (StringNCmp(cit_gen.GetCit().c_str(), "(er)", 4) == 0 || er))
685 {
686 cit_arts.push_back(pPub);
687 break;
688 }
689 }
690
691 if (cit_arts.empty())
692 {
693 fta_check_pub_ids(pub_equiv.Set());
694 m_pPubFix->FixPubEquiv(pub_equiv);
695 return;
696 }
697
698 auto cit_gen = cit_arts.front();
699
700 list<CRef<CPub>> others;
701 CRef<CPub> pMuid, pPmid;
702
703 for (auto pPub : pub_equiv.Set())
704 {
705 if (cit_gen == pPub)
706 continue;
707 if (pPub->IsMuid() && !pMuid)
708 pMuid = pPub;
709 else if (pPub->IsPmid() && !pPmid)
710 pPmid = pPub;
711 else if (!pPub->IsArticle())
712 others.push_back(pPub);
713 }
714
715
716
717 TEntrezId oldpmid = pPmid ? pPmid->GetPmid() : ZERO_ENTREZ_ID;
718 TEntrezId oldmuid = pMuid ? pMuid->GetMuid() : ZERO_ENTREZ_ID;
719 TEntrezId muid = ZERO_ENTREZ_ID;
720 TEntrezId pmid = ZERO_ENTREZ_ID;
721
722 CRef<CCit_art> new_cit_art;
723 if(oldpmid > ZERO_ENTREZ_ID)
724 {
725 new_cit_art = FetchPubPmId(ENTREZ_ID_TO(Int4, oldpmid));
726 if (new_cit_art.Empty())
727 {
728 ErrPostEx(SEV_REJECT, ERR_REFERENCE_InvalidPmid,
729 "MedArch failed to find a Cit-art for reference with pmid \"%d\".",
730 oldpmid);
731 ibp->drop = 1;
732 }
733 else
734 {
735 if (new_cit_art->IsSetIds())
736 {
737 for (const auto& pId : new_cit_art->GetIds().Get())
738 {
739 if (pId->IsPubmed()) {
740 pmid = pId->GetPubmed();
741 }
742 else if (pId->IsMedline()) {
743 muid = pId->GetMedline();
744 }
745 }
746 }
747
748 if(pmid == ZERO_ENTREZ_ID)
749 {
750 ErrPostEx(SEV_REJECT, ERR_REFERENCE_CitArtLacksPmid,
751 "Cit-art returned by MedArch lacks pmid identifier in its ArticleIdSet.");
752 ibp->drop = 1;
753 }
754 else if(pmid != oldpmid)
755 {
756 ErrPostEx(SEV_REJECT, ERR_REFERENCE_DifferentPmids,
757 "Pmid \"%d\" used for lookup does not match pmid \"%d\" in the ArticleIdSet of the Cit-art returned by MedArch.",
758 oldpmid, pmid);
759 ibp->drop = 1;
760 }
761 if(muid > ZERO_ENTREZ_ID && oldmuid > ZERO_ENTREZ_ID && muid != oldmuid)
762 {
763 ErrPostEx(SEV_ERROR, ERR_REFERENCE_MuidPmidMissMatch,
764 "Reference has supplied Medline UI \"%d\" but it does not match muid \"%d\" in the Cit-art returned by MedArch.",
765 oldmuid, muid);
766 }
767 }
768 }
769
770 if (new_cit_art.NotEmpty() && !ibp->drop)
771 {
772 cit_arts.clear();
773 CRef<objects::CPub> new_pub(new objects::CPub);
774 new_pub->SetArticle(*new_cit_art);
775 cit_arts.push_back(new_pub);
776
777 if (pmid > ZERO_ENTREZ_ID && !pPmid)
778 {
779 pPmid = Ref(new CPub());
780 pPmid->SetPmid().Set(pmid);
781 }
782
783 if(muid > ZERO_ENTREZ_ID && !pMuid)
784 {
785 pMuid = Ref(new CPub());
786 pMuid->SetMuid(muid);
787 }
788 }
789
790 auto& pub_list = pub_equiv.Set();
791 pub_list = others;
792 if (pPmid) {
793 pub_list.push_back(pPmid);
794 }
795 if (pMuid && muid > ZERO_ENTREZ_ID) {
796 pub_list.push_back(pMuid);
797 }
798 pub_list.splice(pub_list.end(), cit_arts);
799 }
800
801 /**********************************************************/
fix_pub_annot(CPub & pub,ParserPtr pp,bool er)802 void CFindPub::fix_pub_annot(CPub& pub, ParserPtr pp, bool er)
803 {
804 if (pp == NULL)
805 return;
806
807 if (pub.IsEquiv())
808 {
809 fix_pub_equiv(pub.SetEquiv(), pp, er);
810 if(pp->qamode)
811 fta_fix_imprint_language(pub.SetEquiv().Set());
812 fta_fix_affil(pub.SetEquiv().Set(), pp->source);
813 return;
814 }
815
816 m_pPubFix->FixPub(pub);
817 }
818
819
820 /**********************************************************/
find_pub(ParserPtr pp,list<CRef<CSeq_annot>> & annots,CSeq_descr & descrs)821 void CFindPub::find_pub(ParserPtr pp, list<CRef<CSeq_annot>>& annots, CSeq_descr& descrs)
822 {
823 bool er = any_of(begin(descrs.Get()), end(descrs.Get()),
824 [](CRef<CSeqdesc> pDesc) {
825 if (pDesc->IsPub()) {
826 const auto& pubdesc = pDesc->GetPub();
827 return (pubdesc.IsSetComment() &&
828 fta_remark_is_er(pubdesc.GetComment().c_str()));
829 }
830 return false;
831 });
832
833
834 for (auto pDescr : descrs.Set())
835 {
836 if (!pDescr->IsPub())
837 continue;
838
839 CPubdesc& pub_descr = pDescr->SetPub();
840 fix_pub_equiv(pub_descr.SetPub(), pp, er);
841 if(pp->qamode)
842 fta_fix_imprint_language(pub_descr.SetPub().Set());
843 fta_fix_affil(pub_descr.SetPub().Set(), pp->source);
844 fta_strip_er_remarks(pub_descr);
845 }
846
847 for (auto pAnnot : annots)
848 {
849 if (!pAnnot->IsSetData() || !pAnnot->GetData().IsFtable()) /* feature table */
850 continue;
851
852
853 for (auto pFeat : pAnnot->SetData().SetFtable())
854 {
855 if (pFeat->IsSetData() && pFeat->GetData().IsPub()) /* pub feature */
856 {
857 fix_pub_equiv(pFeat->SetData().SetPub().SetPub(), pp, er);
858 if(pp->qamode)
859 fta_fix_imprint_language(pFeat->SetData().SetPub().SetPub().Set());
860 fta_fix_affil(pFeat->SetData().SetPub().SetPub().Set(), pp->source);
861 fta_strip_er_remarks(pFeat->SetData().SetPub());
862 }
863
864 if (!pFeat->IsSetCit()) {
865 continue;
866 }
867
868 for (auto pPub : pFeat->SetCit().SetPub()) {
869 if (pPub) {
870 fix_pub_annot(*pPub, pp, er);
871 }
872 }
873 }
874 }
875 }
876
877 /**********************************************************/
878 //static void fta_find_pub(ParserPtr pp, TEntryList& seq_entries)
Apply(list<CRef<CSeq_entry>> & seq_entries)879 void CFindPub::Apply(list<CRef<CSeq_entry>>& seq_entries)
880 {
881 for (auto pEntry : seq_entries)
882 {
883 for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*pEntry)); bio_set; ++bio_set)
884 {
885 find_pub(m_pParser, bio_set->SetAnnot(), bio_set->SetDescr());
886
887 if (bio_set->GetDescr().Get().empty())
888 bio_set->ResetDescr();
889
890 if (bio_set->SetAnnot().empty())
891 bio_set->ResetAnnot();
892 }
893
894 for (CTypeIterator<objects::CBioseq> bioseq(Begin(*pEntry)); bioseq; ++bioseq)
895 {
896 find_pub(m_pParser, bioseq->SetAnnot(), bioseq->SetDescr());
897
898 if (bioseq->GetDescr().Get().empty())
899 bioseq->ResetDescr();
900
901 if (bioseq->SetAnnot().empty())
902 bioseq->ResetAnnot();
903 }
904 }
905 }
906
907 /**********************************************************/
fta_find_pub_explore(ParserPtr pp,TEntryList & seq_entries)908 void fta_find_pub_explore(ParserPtr pp, TEntryList& seq_entries)
909 {
910 if(pp->medserver == 0)
911 return;
912
913 if(pp->medserver == 2)
914 pp->medserver = fta_init_med_server();
915
916 if (pp->medserver == 1)
917 {
918 CFindPub find_pub(pp);
919 find_pub.Apply(seq_entries);
920 }
921 }
922
923 /**********************************************************/
new_synonym(objects::COrg_ref & org_ref,objects::COrg_ref & tax_org_ref)924 static void new_synonym(objects::COrg_ref& org_ref, objects::COrg_ref& tax_org_ref)
925 {
926 if (!org_ref.CanGetSyn() || !tax_org_ref.CanGetSyn())
927 return;
928
929 ITERATE(objects::COrg_ref::TSyn, org_syn, org_ref.GetSyn())
930 {
931 bool found = false;
932 ITERATE(objects::COrg_ref::TSyn, tax_syn, tax_org_ref.GetSyn())
933 {
934 if (*org_syn == *tax_syn)
935 {
936 found = true;
937 break;
938 }
939 }
940
941 if (!found)
942 {
943 ErrPostEx(SEV_INFO, ERR_ORGANISM_NewSynonym,
944 "New synonym: %s for [%s].",
945 org_syn->c_str(), org_ref.GetTaxname().c_str());
946 }
947 }
948 }
949
950
/* Timeout passed to CTaxon1::Init() by the taxonomy lookups in this file.
   The first STimeout field is taken from TAX_SERVER_TIMEOUT and the second
   is zero (presumably seconds and microseconds - confirm against the
   STimeout declaration). */
#define TAX_SERVER_TIMEOUT 3
static const STimeout s_timeout = { TAX_SERVER_TIMEOUT, 0 };
953
fix_synonyms(objects::CTaxon1 & taxon,objects::COrg_ref & org_ref)954 static void fix_synonyms(objects::CTaxon1& taxon, objects::COrg_ref& org_ref)
955 {
956 bool with_syns = taxon.SetSynonyms(false);
957 if (!with_syns)
958 org_ref.SetSyn().clear();
959 else
960 taxon.SetSynonyms(true);
961 }
962
963 /**********************************************************/
fta_get_orgref_byid(ParserPtr pp,unsigned char * drop,Int4 taxid,bool isoh)964 static CRef<objects::COrg_ref> fta_get_orgref_byid(ParserPtr pp, unsigned char* drop, Int4 taxid, bool isoh)
965 {
966 CConstRef<objects::CTaxon2_data> taxdata;
967
968 objects::CTaxon1 taxon;
969
970 bool connection_failed = false;
971 for (size_t i = 0; i < 3 && taxdata.Empty(); ++i)
972 {
973 if (taxon.Init(&s_timeout))
974 {
975 taxdata = taxon.GetById(TAX_ID_FROM(Int4, taxid));
976 }
977 else
978 {
979 connection_failed = true;
980 break;
981 }
982 }
983
984 CRef<objects::COrg_ref> ret;
985 if (taxdata.Empty())
986 {
987 if (connection_failed)
988 {
989 ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
990 "Taxonomy lookup failed for taxid %d, apparently because the server is down. Cannot generate ASN.1 for this entry.",
991 taxid);
992 *drop = 1;
993 }
994 else
995 {
996 ErrPostEx(SEV_ERROR, ERR_ORGANISM_TaxNameNotFound,
997 "Taxname not found: [taxid %d].", taxid);
998 }
999 return ret;
1000 }
1001
1002 if (taxdata->GetIs_species_level() != 1 && !isoh)
1003 {
1004 ErrPostEx(SEV_WARNING, ERR_ORGANISM_TaxIdNotSpecLevel,
1005 "Taxarch hit is not on species level: [taxid %d].", taxid);
1006 }
1007
1008 ret.Reset(new objects::COrg_ref);
1009 ret->Assign(taxdata->GetOrg());
1010 fix_synonyms(taxon, *ret);
1011
1012 if (ret->IsSetSyn() && ret->GetSyn().empty())
1013 ret->ResetSyn();
1014
1015 return ret;
1016 }
1017
1018 /**********************************************************/
fta_fix_orgref_byid(ParserPtr pp,Int4 taxid,unsigned char * drop,bool isoh)1019 CRef<objects::COrg_ref> fta_fix_orgref_byid(ParserPtr pp, Int4 taxid, unsigned char* drop, bool isoh)
1020 {
1021 CRef<objects::COrg_ref> ret;
1022
1023 if(taxid < 1 && pp->taxserver == 0)
1024 return ret;
1025
1026 if(pp->taxserver == 2)
1027 pp->taxserver = fta_init_tax_server();
1028
1029 if(pp->taxserver == 2)
1030 {
1031 ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
1032 "Taxonomy lookup failed for taxid %d, because the server is down. Cannot generate ASN.1 for this entry.",
1033 taxid);
1034 *drop = 1;
1035 return ret;
1036 }
1037
1038 ret = fta_get_orgref_byid(pp, drop, taxid, isoh);
1039 if (ret.NotEmpty())
1040 {
1041 ErrPostEx(SEV_INFO, ERR_SERVER_TaxNameWasFound,
1042 "Taxname _was_ found for taxid %d", taxid);
1043 }
1044
1045 return ret;
1046 }
1047
1048 /**********************************************************/
fta_replace_org(ParserPtr pp,unsigned char * drop,objects::COrg_ref & org_ref,const Char * pn,int merge,Int4 attempt)1049 static CRef<objects::COrg_ref> fta_replace_org(ParserPtr pp, unsigned char* drop, objects::COrg_ref& org_ref,
1050 const Char* pn, int merge, Int4 attempt)
1051 {
1052 IndexblkPtr ibp = pp->entrylist[pp->curindx];
1053
1054 CConstRef<objects::CTaxon2_data> taxdata;
1055
1056 objects::CTaxon1 taxon;
1057
1058 bool connection_failed = true;
1059 for (size_t i = 0; i < 3 && taxdata.Empty(); ++i)
1060 {
1061 if (taxon.Init(&s_timeout))
1062 {
1063 if (merge)
1064 {
1065 taxdata = taxon.LookupMerge(org_ref);
1066 }
1067 else
1068 taxdata = taxon.Lookup(org_ref);
1069 connection_failed = false;
1070 break;
1071 }
1072 else
1073 taxon.Fini();
1074 }
1075
1076 CRef<objects::COrg_ref> ret;
1077
1078 if (taxdata.Empty())
1079 {
1080 if(attempt == 1)
1081 return ret;
1082
1083 if (connection_failed)
1084 {
1085 ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
1086 "Taxonomy lookup failed for \"%s\", apparently because the server is down. Cannot generate ASN.1 for this entry.",
1087 pn);
1088 *drop = 1;
1089 }
1090 else if(taxon.GetTaxIdByOrgRef(org_ref) < ZERO_TAX_ID)
1091 {
1092 if((pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL) &&
1093 ibp->is_pat && ibp->taxid > 0 && ibp->organism != NULL)
1094 {
1095 ret = fta_fix_orgref_byid(pp, ibp->taxid, &ibp->drop, true);
1096 if (ret.NotEmpty() && ret->IsSetTaxname() &&
1097 ret->GetTaxname() == ibp->organism)
1098 {
1099 ibp->no_gc_warning = true;
1100 return ret;
1101 }
1102 }
1103 ErrPostEx(SEV_ERROR, ERR_ORGANISM_TaxIdNotUnique,
1104 "Not an unique Taxonomic Id for [%s].", pn);
1105 }
1106 else
1107 {
1108 ErrPostEx(SEV_ERROR, ERR_ORGANISM_TaxNameNotFound,
1109 "Taxon Id not found for [%s].", pn);
1110 }
1111 return ret;
1112 }
1113
1114 if (taxdata->GetIs_species_level() != 1 && (ibp->is_pat == false ||
1115 (pp->source != Parser::ESource::EMBL && pp->source != Parser::ESource::DDBJ)))
1116 {
1117 ErrPostEx(SEV_WARNING, ERR_ORGANISM_TaxIdNotSpecLevel,
1118 "Taxarch hit is not on species level for [%s].", pn);
1119 }
1120
1121 ret.Reset(new objects::COrg_ref);
1122
1123 if (merge)
1124 ret->Assign(org_ref);
1125 else
1126 ret->Assign(taxdata->GetOrg());
1127
1128 return ret;
1129 }
1130
1131 /**********************************************************/
fta_fix_orgref(ParserPtr pp,objects::COrg_ref & org_ref,unsigned char * drop,char * organelle)1132 void fta_fix_orgref(ParserPtr pp, objects::COrg_ref& org_ref, unsigned char* drop,
1133 char* organelle)
1134 {
1135 Int4 attempt;
1136 int merge;
1137
1138 if (org_ref.IsSetTaxname())
1139 {
1140 std::string taxname = org_ref.GetTaxname();
1141
1142 size_t last_char = taxname.size();
1143 for (; last_char; --last_char)
1144 {
1145 if (!isspace(taxname[last_char]))
1146 break;
1147 }
1148
1149 if (!isspace(taxname[last_char]))
1150 ++last_char;
1151 org_ref.SetTaxname(taxname.substr(0, last_char));
1152 }
1153
1154 if(pp->taxserver == 0)
1155 return;
1156
1157 if(pp->taxserver == 2)
1158 pp->taxserver = fta_init_tax_server();
1159
1160 std::string old_taxname;
1161 if (organelle != NULL)
1162 {
1163 std::string taxname = org_ref.IsSetTaxname() ? org_ref.GetTaxname() : "",
1164 organelle_str(organelle),
1165 space(taxname.size() ? " " : "");
1166
1167 old_taxname = taxname;
1168 taxname = organelle_str + space + taxname;
1169 org_ref.SetTaxname(taxname);
1170 attempt = 1;
1171 }
1172 else
1173 {
1174 attempt = 2;
1175 }
1176
1177 std::string taxname = org_ref.IsSetTaxname() ? org_ref.GetTaxname() : "";
1178 if (pp->taxserver == 2)
1179 {
1180 ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
1181 "Taxonomy lookup failed for \"%s\", because the server is down. Cannot generate ASN.1 for this entry.",
1182 taxname.c_str());
1183 *drop = 1;
1184 }
1185 else
1186 {
1187 merge = (pp->format == Parser::EFormat::PIR) ? 0 : 1;
1188
1189 CRef<objects::COrg_ref> new_org_ref = fta_replace_org(pp, drop, org_ref, taxname.c_str(), merge, attempt);
1190 if (new_org_ref.Empty() && attempt == 1)
1191 {
1192 org_ref.SetTaxname(old_taxname);
1193 old_taxname.clear();
1194 new_org_ref = fta_replace_org(pp, drop, org_ref, "", merge, 2);
1195 }
1196
1197 if (new_org_ref.NotEmpty())
1198 {
1199 ErrPostEx(SEV_INFO, ERR_SERVER_TaxNameWasFound,
1200 "Taxon Id _was_ found for [%s]", taxname.c_str());
1201 if(pp->format == Parser::EFormat::PIR)
1202 new_synonym(org_ref, *new_org_ref);
1203
1204 org_ref.Assign(*new_org_ref);
1205 }
1206 }
1207
1208 if (org_ref.IsSetSyn() && org_ref.GetSyn().empty())
1209 org_ref.ResetSyn();
1210 }
1211
1212 /**********************************************************/
fta_get_gi_for_seq_id(const objects::CSeq_id & id)1213 static TGi fta_get_gi_for_seq_id(const objects::CSeq_id& id)
1214 {
1215 TGi gi = objects::sequence::GetGiForId(id, GetScope());
1216 if(gi > ZERO_GI)
1217 return(gi);
1218
1219
1220 objects::CSeq_id test_id;
1221 test_id.SetGenbank().SetAccession(HEALTHY_ACC);
1222
1223 int i = 0;
1224 for (; i < 5; i++)
1225 {
1226 if (objects::sequence::GetGiForId(test_id, GetScope()) > ZERO_GI)
1227 break;
1228 SleepSec(3);
1229 }
1230
1231 if(i == 5)
1232 return GI_CONST(-1);
1233
1234 gi = objects::sequence::GetGiForId(id, GetScope());
1235 if (gi > ZERO_GI)
1236 return(gi);
1237
1238 return ZERO_GI;
1239 }
1240
1241 /**********************************************************
1242 * returns -1 if couldn't get division;
1243 * 1 if it's CON;
1244 * 0 if it's not CON.
1245 */
fta_is_con_div(ParserPtr pp,const objects::CSeq_id & id,const Char * acc)1246 Int4 fta_is_con_div(ParserPtr pp, const objects::CSeq_id& id, const Char* acc)
1247 {
1248 if(pp->entrez_fetch == 0)
1249 return(-1);
1250 //if (pp->entrez_fetch == 2)
1251 // pp->entrez_fetch = fta_init_pubseq();
1252 if(pp->entrez_fetch == 2)
1253 {
1254 ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
1255 "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
1256 acc);
1257 pp->entrylist[pp->curindx]->drop = 1;
1258 return(-1);
1259 }
1260
1261 TGi gi = fta_get_gi_for_seq_id(id);
1262 if(gi < ZERO_GI)
1263 {
1264 ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
1265 "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
1266 acc);
1267 pp->entrylist[pp->curindx]->drop = 1;
1268 return(-1);
1269 }
1270
1271 if (gi == ZERO_GI)
1272 return(0);
1273 #if 0 // RW-707
1274 CPubseqAccess::IdGiClass id_gi;
1275 CPubseqAccess::IdBlobClass id_blob;
1276
1277 if (!s_pubseq->GetIdGiClass(gi, id_gi) || !s_pubseq->GetIdBlobClass(id_gi, id_blob) ||
1278 id_blob.div[0] == '\0')
1279 {
1280 ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
1281 "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
1282 acc);
1283 pp->entrylist[pp->curindx]->drop = 1;
1284 return(-1);
1285 }
1286 if (NStr::EqualNocase(id_blob.div, "CON"))
1287 return(1);
1288 #endif
1289 return(0);
1290 }
1291
1292 /**********************************************************/
fta_citart_by_pmid(Int4 pmid,bool & done)1293 CRef<objects::CCit_art> fta_citart_by_pmid(Int4 pmid, bool& done)
1294 {
1295 CRef<objects::CCit_art> cit_art;
1296
1297 done = true;
1298 if (pmid < 0)
1299 return cit_art;
1300
1301 cit_art = FetchPubPmId(pmid);
1302 return cit_art;
1303 }
1304
1305 /**********************************************************/
fta_init_gbdataloader()1306 void fta_init_gbdataloader()
1307 {
1308 objects::CGBDataLoader::RegisterInObjectManager(*objects::CObjectManager::GetInstance());
1309 }
1310
1311 END_NCBI_SCOPE
1312