1 /* ftanet.cpp
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  ftanet.cpp
28  *
29  * Author: Sergey Bazhin
30  *
31  * File Description:
32  * -----------------
33  *      Functions for real working with the servers and network.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <objtools/data_loaders/genbank/gbloader.hpp>
41 
42 #include <objects/seqfeat/Org_ref.hpp>
43 #include <objects/taxon1/taxon1.hpp>
44 #include <objects/seqset/Bioseq_set.hpp>
45 #include <objects/seq/Seq_descr.hpp>
46 #include <objects/seq/Pubdesc.hpp>
47 #include <objects/pub/Pub_equiv.hpp>
48 #include <objects/pub/Pub.hpp>
49 #include <objects/biblio/Cit_gen.hpp>
50 #include <objects/biblio/Cit_art.hpp>
51 #include <objects/biblio/Cit_sub.hpp>
52 #include <objects/biblio/Cit_book.hpp>
53 #include <objects/biblio/Cit_let.hpp>
54 #include <objects/biblio/Cit_pat.hpp>
55 #include <objects/biblio/Cit_jour.hpp>
56 #include <objects/biblio/Auth_list.hpp>
57 #include <objects/biblio/Affil.hpp>
58 #include <objects/biblio/Author.hpp>
59 #include <objects/biblio/Imprint.hpp>
60 #include <objects/seq/Seq_annot.hpp>
61 #include <objects/pub/Pub_set.hpp>
62 #include <objects/biblio/ArticleIdSet.hpp>
63 #include <objects/biblio/ArticleId.hpp>
64 #include <objmgr/util/sequence.hpp>
65 #include <objtools/edit/pub_fix.hpp>
66 
67 #include <dbapi/driver/drivers.hpp>
68 
69 #include "index.h"
70 
71 #include <objtools/flatfile/flatdefn.h>
72 #include <objtools/flatfile/flatfile_parser.hpp>
73 #include <corelib/ncbi_message.hpp>
74 
75 #include "ftaerr.hpp"
76 #include "asci_blk.h"
77 #include "ftamed.h"
78 #include "utilfun.h"
79 #include "ref.h"
80 #include "flatfile_message_reporter.hpp"
81 
82 #ifdef THIS_FILE
83 #    undef THIS_FILE
84 #endif
85 #define THIS_FILE "ftanet.cpp"
86 
87 #define HEALTHY_ACC "U12345"
88 
89 BEGIN_NCBI_SCOPE
90 USING_SCOPE(objects);
91 
/* Table of "Publication Status" remark variants that fta_strip_pub_comment()
 * removes from a publication comment.  Each entry pairs the text to look
 * for with its length in characters; the {NULL, 0} entry terminates the
 * table.
 *
 * The table is scanned in order, and the "... prior to print" variants are
 * listed before the plain "Available-Online" ones that are their prefixes,
 * so the longer phrases are removed whole instead of leaving
 * " prior to print" behind.
 */
static KwordBlk PubStatus[] = {
    {"Publication Status: Available-Online prior to print", 51},
    {"Publication Status : Available-Online prior to print", 52},
    {"Publication_Status: Available-Online prior to print", 51},
    {"Publication_Status : Available-Online prior to print", 52},
    {"Publication-Status: Available-Online prior to print", 51},
    {"Publication-Status : Available-Online prior to print", 52},
    {"Publication Status: Online-Only", 31},
    {"Publication Status : Online-Only", 32},
    {"Publication_Status: Online-Only", 31},
    {"Publication_Status : Online-Only", 32},
    {"Publication-Status: Online-Only", 31},
    {"Publication-Status : Online-Only", 32},
    {"Publication Status: Available-Online", 36},
    {"Publication Status : Available-Online", 37},
    {"Publication_Status: Available-Online", 36},
    {"Publication_Status : Available-Online", 37},
    {"Publication-Status: Available-Online", 36},
    {"Publication-Status : Available-Online", 37},
    {NULL, 0}
};
113 
114 /**********************************************************/
fta_strip_pub_comment(char * comment,KwordBlkPtr kbp)115 static char* fta_strip_pub_comment(char* comment, KwordBlkPtr kbp)
116 {
117     char* p;
118     char* q;
119 
120     ShrinkSpaces(comment);
121     for(; kbp->str != NULL; kbp++)
122     {
123         for(;;)
124         {
125             p = StringIStr(comment, kbp->str);
126             if(p == NULL)
127                 break;
128             for(q = p + kbp->len; *q == ' ' || *q == ';';)
129                 q++;
130             fta_StringCpy(p, q);
131         }
132     }
133 
134     ShrinkSpaces(comment);
135     p = (*comment == '\0') ? NULL : StringSave(comment);
136     MemFree(comment);
137 
138     if(p != NULL && (StringNICmp(p, "Publication Status", 18) == 0 ||
139                      StringNICmp(p, "Publication_Status", 18) == 0 ||
140                      StringNICmp(p, "Publication-Status", 18) == 0))
141         ErrPostEx(SEV_WARNING, ERR_REFERENCE_UnusualPubStatus,
142                   "An unusual Publication Status comment exists for this record: \"%s\". If it is a new variant of the special comments used to indicate ahead-of-print or online-only articles, then the comment must be added to the appropriate table of the parser.",
143                   p);
144 
145     return(p);
146 }
147 
148 /**********************************************************/
fta_fix_last_initials(objects::CName_std & namestd,bool initials)149 static void fta_fix_last_initials(objects::CName_std &namestd,
150                                   bool initials)
151 {
152     char *str;
153     char *p;
154 
155     if(initials)
156     {
157         if(!namestd.IsSetInitials())
158             return;
159         str = (char *) namestd.GetInitials().c_str();
160     }
161     else
162     {
163         if(!namestd.IsSetLast())
164             return;
165         str = (char *) namestd.GetLast().c_str();
166     }
167 
168     size_t i = strlen(str);
169     if(i > 5)
170     {
171         p = &str[i-5];
172         if((*p == ' ' || *p == '.') && !strcmp(p + 1, "III."))
173         {
174             namestd.SetSuffix("III");
175             if(*p == '.')
176                 p++;
177             *p = '\0';
178             if(initials)
179                 namestd.SetInitials(str);
180             else
181                 namestd.SetLast(str);
182             i = 0;
183         }
184     }
185     if(i > 4)
186     {
187         p = &str[i-4];
188         if((*p == ' ' || *p == '.') &&
189            (!strcmp(p + 1, "III") || !strcmp(p + 1, "2nd") ||
190             !strcmp(p + 1, "Jr.") || !strcmp(p + 1, "IV.")))
191         {
192             if(!strcmp(p + 1, "III"))
193                 namestd.SetSuffix("III");
194             else if(!strcmp(p + 1, "2nd"))
195                 namestd.SetSuffix("II");
196             else if(!strcmp(p + 1, "Jr."))
197                 namestd.SetSuffix("Jr.");
198             else
199                 namestd.SetSuffix("IV");
200             if(*p == '.')
201                 p++;
202             *p = '\0';
203             if(initials)
204                 namestd.SetInitials(str);
205             else
206                 namestd.SetLast(str);
207             i = 0;
208         }
209     }
210     if(i > 3)
211     {
212         p = &str[i-3];
213         if((*p == ' ' || *p == '.') &&
214            (!strcmp(p + 1, "Jr") || !strcmp(p + 1, "IV") ||
215             !strcmp(p + 1, "II")))
216         {
217             if(!strcmp(p + 1, "Jr"))
218                 namestd.SetSuffix("Jr.");
219             else if(!strcmp(p + 1, "IV"))
220                 namestd.SetSuffix("IV");
221             else
222                 namestd.SetSuffix("II");
223             if(*p == '.')
224                 p++;
225             *p = '\0';
226             if(initials)
227                 namestd.SetInitials(str);
228             else
229                 namestd.SetLast(str);
230             i = 0;
231         }
232     }
233 }
234 
235 /**********************************************************/
fta_fix_affil(TPubList & pub_list,Parser::ESource source)236 static void fta_fix_affil(TPubList &pub_list, Parser::ESource source)
237 {
238     bool got_pmid = false;
239 
240     NON_CONST_ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
241     {
242         if(!(*pub)->IsPmid())
243             continue;
244         got_pmid = true;
245         break;
246     }
247 
248     NON_CONST_ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
249     {
250         objects::CAuth_list *authors;
251         if((*pub)->IsArticle())
252         {
253             objects::CCit_art &art = (*pub)->SetArticle();
254             if(!art.IsSetAuthors() || !art.CanGetAuthors())
255                 continue;
256 
257             authors = &art.SetAuthors();
258         }
259         else if((*pub)->IsSub())
260         {
261             objects::CCit_sub &sub = (*pub)->SetSub();
262             if(!sub.IsSetAuthors() || !sub.CanGetAuthors())
263                 continue;
264 
265             authors = &sub.SetAuthors();
266         }
267         else if((*pub)->IsGen())
268         {
269             objects::CCit_gen &gen = (*pub)->SetGen();
270             if(!gen.IsSetAuthors() || !gen.CanGetAuthors())
271                 continue;
272 
273             authors = &gen.SetAuthors();
274         }
275         else if((*pub)->IsBook())
276         {
277             objects::CCit_book &book = (*pub)->SetBook();
278             if(!book.IsSetAuthors() || !book.CanGetAuthors())
279                 continue;
280 
281             authors = &book.SetAuthors();
282         }
283         else if((*pub)->IsMan())
284         {
285             objects::CCit_let &man = (*pub)->SetMan();
286             if(!man.IsSetCit() || !man.CanGetCit())
287                 continue;
288 
289             objects::CCit_book &book = man.SetCit();
290             if(!book.IsSetAuthors() || !book.CanGetAuthors())
291                 continue;
292 
293             authors = &book.SetAuthors();
294         }
295         else if((*pub)->IsPatent())
296         {
297             objects::CCit_pat &pat = (*pub)->SetPatent();
298             if(!pat.IsSetAuthors() || !pat.CanGetAuthors())
299                 continue;
300 
301             authors = &pat.SetAuthors();
302         }
303         else
304             continue;
305 
306 
307         if(authors->IsSetAffil() && authors->CanGetAffil() &&
308            authors->GetAffil().Which() == objects::CAffil::e_Str)
309         {
310             objects::CAffil &affil = authors->SetAffil();
311             char *aff = (char *) affil.GetStr().c_str();
312             ShrinkSpaces(aff);
313             affil.SetStr(aff);
314         }
315 
316         if(authors->IsSetNames() && authors->CanGetNames() &&
317            authors->GetNames().Which() == objects::CAuth_list::TNames::e_Std)
318         {
319             objects::CAuth_list::TNames &names = authors->SetNames();
320             objects::CAuth_list::TNames::TStd::iterator it = (names.SetStd()).begin();
321             objects::CAuth_list::TNames::TStd::iterator it_end = (names.SetStd()).end();
322             for(; it != it_end; it++)
323             {
324                 if((*it)->IsSetAffil() && (*it)->CanGetAffil() &&
325                    (*it)->GetAffil().Which() == objects::CAffil::e_Str)
326                 {
327                     objects::CAffil &affil = (*it)->SetAffil();
328                     char *aff = (char *) affil.GetStr().c_str();
329                     ShrinkSpaces(aff);
330                     affil.SetStr(aff);
331                 }
332                 if((*it)->IsSetName() && (*it)->CanGetName() &&
333                    (*it)->GetName().IsName())
334                 {
335                     objects::CName_std &namestd = (*it)->SetName().SetName();
336 /* bsv: commented out single letter first name population*/
337                     if(source != Parser::ESource::SPROT && source != Parser::ESource::PIR &&
338                        !got_pmid)
339                     {
340                         if(!namestd.IsSetFirst() && namestd.IsSetInitials())
341                         {
342                             char *str = (char *) namestd.GetInitials().c_str();
343                             if((strlen(str) == 1 || strlen(str) == 2) &&
344                                (str[1] == '.' || str[1] == '\0'))
345                             {
346                                 char *p = (char*) MemNew(2);
347                                 p[0] = str[0];
348                                 p[1] = '\0';
349                                 namestd.SetFirst(p);
350                                 MemFree(p);
351                             }
352                         }
353                         if((*pub)->IsArticle())
354                         {
355                             objects::CCit_art &art1 = (*pub)->SetArticle();
356                             if(art1.IsSetAuthors() && art1.CanGetAuthors())
357                             {
358                                 objects::CAuth_list *authors1;
359                                 authors1 = &art1.SetAuthors();
360                                 if(authors1->IsSetNames() &&
361                                    authors1->CanGetNames() &&
362                                    authors1->GetNames().Which() == objects::CAuth_list::TNames::e_Std)
363                                 {
364                                     objects::CAuth_list::TNames &names1 = authors1->SetNames();
365                                     objects::CAuth_list::TNames::TStd::iterator it1 = (names1.SetStd()).begin();
366                                     objects::CAuth_list::TNames::TStd::iterator it1_end = (names1.SetStd()).end();
367                                     for(; it1 != it1_end; it1++)
368                                     {
369                                         if((*it1)->IsSetName() &&
370                                            (*it1)->CanGetName() &&
371                                            (*it1)->GetName().IsName())
372                                         {
373                                             objects::CName_std &namestd1 = (*it1)->SetName().SetName();
374                                             if(!namestd1.IsSetFirst() &&
375                                                namestd1.IsSetInitials())
376                                             {
377                                                 char *str = (char *) namestd1.GetInitials().c_str();
378                                                 if((strlen(str) == 1 || strlen(str) == 2) &&
379                                                    (str[1] == '.' || str[1] == '\0'))
380                                                 {
381                                                     char *p = (char*) MemNew(2);
382                                                     p[0] = str[0];
383                                                     p[1] = '\0';
384                                                     namestd1.SetFirst(p);
385                                                     MemFree(p);
386                                                 }
387                                             }
388                                         }
389                                     }
390                                 }
391                             }
392                         }
393                     }
394 /**/
395 
396                     if(namestd.IsSetSuffix())
397                         continue;
398                     fta_fix_last_initials(namestd, true);
399                     if(!namestd.IsSetSuffix())
400                         fta_fix_last_initials(namestd, false);
401                 }
402             }
403         }
404     }
405 }
406 
407 /**********************************************************/
fta_fix_imprint_language(TPubList & pub_list)408 static void fta_fix_imprint_language(TPubList &pub_list)
409 {
410     NON_CONST_ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
411     {
412         if(!(*pub)->IsArticle())
413             continue;
414 
415         objects::CCit_art &art = (*pub)->SetArticle();
416         if(!art.IsSetFrom() || !art.GetFrom().IsJournal())
417             continue;
418 
419         objects::CCit_jour &journal = art.SetFrom().SetJournal();
420 
421         if(journal.IsSetImp() && journal.GetImp().IsSetLanguage())
422         {
423             string language = journal.GetImp().GetLanguage();
424             char *p;
425             char *lang = (char *) language.c_str();
426             for(p = lang; *p != '\0'; p++)
427                 if(*p >= 'A' && *p <= 'Z')
428                      *p |= 040;
429            journal.SetImp().SetLanguage(lang);
430       }
431     }
432 }
433 
434 /**********************************************************/
fta_strip_er_remarks(objects::CPubdesc & pub_descr)435 static void fta_strip_er_remarks(objects::CPubdesc& pub_descr)
436 {
437     if (!pub_descr.IsSetComment())
438         return;
439 
440     ITERATE(objects::CPub_equiv::Tdata, pub, pub_descr.GetPub().Get())
441     {
442         if (!(*pub)->IsArticle())
443             continue;
444 
445         const objects::CCit_art& art = (*pub)->GetArticle();
446         if (!art.IsSetFrom() || !art.GetFrom().IsJournal())
447             continue;
448 
449         const objects::CCit_jour& journal = art.GetFrom().GetJournal();
450 
451         int status = 0;
452         if (journal.IsSetImp() && journal.GetImp().IsSetPubstatus())
453             status = journal.GetImp().GetPubstatus();
454 
455         if (status == 3 ||          /* epublish */
456             status == 4 ||          /* ppublish */
457             status == 10)           /* aheadofprint */
458         {
459             char* comment = StringSave(pub_descr.GetComment().c_str());
460             comment = fta_strip_pub_comment(comment, PubStatus);
461             if (comment != NULL && comment[0] != 0)
462                 pub_descr.SetComment(comment);
463             else
464                 pub_descr.ResetComment();
465 
466             MemFree(comment);
467         }
468     }
469 }
470 
471 /**********************************************************/
fta_init_med_server(void)472 static Uint1 fta_init_med_server(void)
473 {
474     if(!MedArchInit())
475         return(2);
476     return(1);
477 
478 }
479 
480 /**********************************************************/
fta_init_tax_server(void)481 static Uint1 fta_init_tax_server(void)
482 {
483     objects::CTaxon1 taxon_srv;
484     if (!taxon_srv.Init())
485         return(2);
486     return(1);
487 }
488 
489 /**********************************************************/
fta_init_servers(ParserPtr pp)490 void fta_init_servers(ParserPtr pp)
491 {
492     if(pp->taxserver != 0)
493     {
494         pp->taxserver = fta_init_tax_server();
495         if(pp->taxserver == 2)
496         {
497             ErrPostEx(SEV_WARNING, ERR_SERVER_Failed,
498                       "TaxArchInit call failed.");
499         }
500     }
501     else
502     {
503         ErrPostEx(SEV_WARNING, ERR_SERVER_NoTaxLookup,
504                   "No taxonomy lookup will be performed.");
505     }
506 
507     if(pp->medserver != 0)
508     {
509         pp->medserver = fta_init_med_server();
510         if(pp->medserver == 2)
511         {
512             ErrPostEx(SEV_ERROR, ERR_SERVER_Failed,
513                       "MedArchInit call failed.");
514         }
515     }
516     else
517     {
518         ErrPostEx(SEV_WARNING, ERR_SERVER_NoPubMedLookup,
519                   "No medline lookup will be performed.");
520     }
521 }
522 
523 /**********************************************************/
fta_fini_servers(ParserPtr pp)524 void fta_fini_servers(ParserPtr pp)
525 {
526     if (pp->medserver == 1)
527         MedArchFini();
528     /*    if(pp->taxserver == 1)
529             tax1_fini();*/
530 }
531 
532 #if 0 // RW-707
533 //std::shared_ptr<CPubseqAccess> s_pubseq;
534 
535 /**********************************************************/
/* NOTE: this function sits inside an "#if 0" region (tagged RW-707) and is
 * not compiled; the s_pubseq declaration it relies on is commented out.
 *
 * Registers the FTDS DBAPI driver, reads optional connection settings from
 * the ALTER_OPEN_SERVER, ALTER_USER_NAME and ALTER_USER_PASSWORD environment
 * variables (a built-in default is used when a variable is unset or empty)
 * and creates the CPubseqAccess object held in s_pubseq.
 *
 * Returns 1 when the connection check succeeds and 2 otherwise.
 */
static Uint1 fta_init_pubseq(void)
{
    // C Toolkit's accpubseq.h library gets username/password from
    // the environment.
    // We are now using C++ Toolkit's cpubseq.hpp library which require
    // credentials during the construction of CPubseqAccess.  So read
    // the environment here and pass it along to the constructor.

    DBAPI_RegisterDriver_FTDS();
//    DBAPI_RegisterDriver_CTLIB();

    char* env_val = getenv("ALTER_OPEN_SERVER");
    string idserver = env_val ? env_val : "";

    env_val = getenv("ALTER_USER_NAME");
    string idusername = env_val ? env_val : "";

    env_val = getenv("ALTER_USER_PASSWORD");
    string idpassword = env_val ? env_val : "";

    s_pubseq.reset(new CPubseqAccess(idserver.empty() ? "PUBSEQ_OS_INTERNAL_GI64" : idserver.c_str(),
        idusername.empty() ? "anyone" : idusername.c_str(),
        idpassword.empty() ? "allowed" : idpassword.c_str()));

    if (s_pubseq == nullptr || !s_pubseq->CheckConnection())
        return(2);
    return(1);
    // NOTE(review): the statement below is unreachable, the return above
    // always executes first.
    return 2;
}
565 
566 /**********************************************************/
/* NOTE: this function sits inside an "#if 0" region (tagged RW-707) and is
 * not compiled.  Even if it were, the unconditional return on its first
 * line makes everything after it unreachable.
 *
 * The code after that return would initialise the PUBSEQ connection when
 * pp->entrez_fetch is non-zero, storing the 1/2 result code back into it
 * and warning on failure; with pp->entrez_fetch == 0 it would only warn
 * that no Bioseq fetch is performed.
 */
void fta_entrez_fetch_enable(ParserPtr pp)
{
    return; // RW-707

    if(pp->entrez_fetch != 0)
    {
        pp->entrez_fetch = fta_init_pubseq();
        if(pp->entrez_fetch == 2)
        {
            ErrPostEx(SEV_WARNING, ERR_SERVER_Failed,
                      "Failed to connect to PUBSEQ OS.");
        }
    }
    else
    {
        ErrPostEx(SEV_WARNING, ERR_SERVER_NotUsed,
                  "No PUBSEQ Bioseq fetch will be performed.");
    }
}
586 
587 /**********************************************************/
/* NOTE: this function sits inside an "#if 0" region (tagged RW-707) and is
 * not compiled.
 *
 * Releases the object held in s_pubseq when pp->entrez_fetch is 1.
 */
void fta_entrez_fetch_disable(ParserPtr pp)
{
    if(pp->entrez_fetch == 1)
        s_pubseq.reset();
}
593 #endif
594 
595 /**********************************************************/
fta_fill_find_pub_option(ParserPtr pp,bool htag,bool rtag)596 void fta_fill_find_pub_option(ParserPtr pp, bool htag, bool rtag)
597 {
598     pp->fpo.always_look = !htag;
599     pp->fpo.replace_cit = !rtag;
600     pp->fpo.merge_ids = true;
601 }
602 
603 
604 class CFindPub {
605 
606 public:
CFindPub(Parser * pp)607     CFindPub(Parser* pp) :
608         m_pParser(pp),
609         m_pPubFixListener(new CPubFixMessageListener()) {
610             if (m_pParser) {
611                 const auto& findPubOptions = m_pParser->fpo;
612                 m_pPubFix.reset(new edit::CPubFix(
613                             findPubOptions.always_look,
614                             findPubOptions.replace_cit,
615                             findPubOptions.merge_ids,
616                             m_pPubFixListener.get()));
617             }
618         }
619 
620     using TEntryList = list<CRef<CSeq_entry>>;
621     void Apply(TEntryList& entries);
622 private:
623     void fix_pub_equiv(CPub_equiv& pub_equiv, Parser* pp, bool er);
624     void fix_pub_annot(CPub& pub, Parser* pp, bool er);
625     void find_pub(Parser* pp, list<CRef<CSeq_annot>>& annots, CSeq_descr& descrs);
626 
627     Parser* m_pParser;
628     unique_ptr<CPubFixMessageListener> m_pPubFixListener;
629     unique_ptr<edit::CPubFix> m_pPubFix = nullptr;
630 };
631 
632 
633 
634 /**********************************************************/
fta_check_pub_ids(TPubList & pub_list)635 static void fta_check_pub_ids(TPubList& pub_list)
636 {
637     bool found = false;
638     ITERATE(objects::CPub_equiv::Tdata, pub, pub_list)
639     {
640         if ((*pub)->IsArticle())
641         {
642             found = true;
643             break;
644         }
645     }
646 
647     if (found)
648         return;
649 
650     for (objects::CPub_equiv::Tdata::iterator pub = pub_list.begin(); pub != pub_list.end();)
651     {
652         if (!(*pub)->IsMuid() && !(*pub)->IsPmid())
653         {
654             ++pub;
655             continue;
656         }
657 
658         ErrPostEx(SEV_ERROR, ERR_REFERENCE_ArticleIdDiscarded,
659                   "Article identifier was found for an unpublished, direct submission, book or unparsable article reference, and has been discarded : %s %d.",
660                   (*pub)->IsMuid() ? "MUID" : "PMID", (*pub)->GetMuid());
661 
662         pub = pub_list.erase(pub);
663     }
664 }
665 
666 
667 /**********************************************************/
fix_pub_equiv(CPub_equiv & pub_equiv,ParserPtr pp,bool er)668 void CFindPub::fix_pub_equiv(CPub_equiv& pub_equiv, ParserPtr pp, bool er)
669 {
670     if (!pp)
671         return;
672 
673     IndexblkPtr ibp = pp->entrylist[pp->curindx];
674 
675 
676     list<CRef<CPub>> cit_arts;
677     for (auto pPub : pub_equiv.Set())
678     {
679         if (!pPub->IsGen()) {
680             continue;
681         }
682         const CCit_gen& cit_gen = pPub->SetGen();
683         if (cit_gen.IsSetCit() &&
684             (StringNCmp(cit_gen.GetCit().c_str(), "(er)", 4) == 0 || er))
685         {
686             cit_arts.push_back(pPub);
687             break;
688         }
689     }
690 
691     if (cit_arts.empty())
692     {
693         fta_check_pub_ids(pub_equiv.Set());
694         m_pPubFix->FixPubEquiv(pub_equiv);
695         return;
696     }
697 
698     auto cit_gen = cit_arts.front();
699 
700     list<CRef<CPub>> others;
701     CRef<CPub> pMuid, pPmid;
702 
703     for (auto pPub : pub_equiv.Set())
704     {
705         if (cit_gen == pPub)
706             continue;
707         if (pPub->IsMuid() && !pMuid)
708             pMuid = pPub;
709         else if (pPub->IsPmid() && !pPmid)
710             pPmid = pPub;
711         else if (!pPub->IsArticle())
712             others.push_back(pPub);
713     }
714 
715 
716 
717     TEntrezId oldpmid = pPmid ? pPmid->GetPmid() : ZERO_ENTREZ_ID;
718     TEntrezId oldmuid = pMuid ? pMuid->GetMuid() : ZERO_ENTREZ_ID;
719     TEntrezId muid = ZERO_ENTREZ_ID;
720     TEntrezId pmid = ZERO_ENTREZ_ID;
721 
722     CRef<CCit_art> new_cit_art;
723     if(oldpmid > ZERO_ENTREZ_ID)
724     {
725         new_cit_art = FetchPubPmId(ENTREZ_ID_TO(Int4, oldpmid));
726         if (new_cit_art.Empty())
727         {
728             ErrPostEx(SEV_REJECT, ERR_REFERENCE_InvalidPmid,
729                       "MedArch failed to find a Cit-art for reference with pmid \"%d\".",
730                       oldpmid);
731             ibp->drop = 1;
732         }
733         else
734         {
735             if (new_cit_art->IsSetIds())
736             {
737                 for (const auto& pId : new_cit_art->GetIds().Get())
738                 {
739                     if (pId->IsPubmed()) {
740                         pmid = pId->GetPubmed();
741                     }
742                     else if (pId->IsMedline()) {
743                         muid = pId->GetMedline();
744                     }
745                 }
746             }
747 
748             if(pmid == ZERO_ENTREZ_ID)
749             {
750                 ErrPostEx(SEV_REJECT, ERR_REFERENCE_CitArtLacksPmid,
751                           "Cit-art returned by MedArch lacks pmid identifier in its ArticleIdSet.");
752                 ibp->drop = 1;
753             }
754             else if(pmid != oldpmid)
755             {
756                 ErrPostEx(SEV_REJECT, ERR_REFERENCE_DifferentPmids,
757                           "Pmid \"%d\" used for lookup does not match pmid \"%d\" in the ArticleIdSet of the Cit-art returned by MedArch.",
758                           oldpmid, pmid);
759                 ibp->drop = 1;
760             }
761             if(muid > ZERO_ENTREZ_ID && oldmuid > ZERO_ENTREZ_ID && muid != oldmuid)
762             {
763                 ErrPostEx(SEV_ERROR, ERR_REFERENCE_MuidPmidMissMatch,
764                           "Reference has supplied Medline UI \"%d\" but it does not match muid \"%d\" in the Cit-art returned by MedArch.",
765                           oldmuid, muid);
766             }
767         }
768     }
769 
770     if (new_cit_art.NotEmpty() && !ibp->drop)
771     {
772         cit_arts.clear();
773         CRef<objects::CPub> new_pub(new objects::CPub);
774         new_pub->SetArticle(*new_cit_art);
775         cit_arts.push_back(new_pub);
776 
777         if (pmid > ZERO_ENTREZ_ID && !pPmid)
778         {
779             pPmid = Ref(new CPub());
780             pPmid->SetPmid().Set(pmid);
781         }
782 
783         if(muid > ZERO_ENTREZ_ID && !pMuid)
784         {
785             pMuid = Ref(new CPub());
786             pMuid->SetMuid(muid);
787         }
788     }
789 
790     auto& pub_list = pub_equiv.Set();
791     pub_list = others;
792     if (pPmid) {
793         pub_list.push_back(pPmid);
794     }
795     if (pMuid && muid > ZERO_ENTREZ_ID) {
796         pub_list.push_back(pMuid);
797     }
798     pub_list.splice(pub_list.end(), cit_arts);
799 }
800 
801 /**********************************************************/
fix_pub_annot(CPub & pub,ParserPtr pp,bool er)802 void CFindPub::fix_pub_annot(CPub& pub, ParserPtr pp, bool er)
803 {
804     if (pp == NULL)
805         return;
806 
807     if (pub.IsEquiv())
808     {
809         fix_pub_equiv(pub.SetEquiv(), pp, er);
810         if(pp->qamode)
811             fta_fix_imprint_language(pub.SetEquiv().Set());
812         fta_fix_affil(pub.SetEquiv().Set(), pp->source);
813         return;
814     }
815 
816     m_pPubFix->FixPub(pub);
817 }
818 
819 
820 /**********************************************************/
find_pub(ParserPtr pp,list<CRef<CSeq_annot>> & annots,CSeq_descr & descrs)821 void CFindPub::find_pub(ParserPtr pp, list<CRef<CSeq_annot>>& annots, CSeq_descr& descrs)
822 {
823     bool er = any_of(begin(descrs.Get()), end(descrs.Get()),
824             [](CRef<CSeqdesc> pDesc) {
825                 if (pDesc->IsPub()) {
826                     const auto& pubdesc = pDesc->GetPub();
827                     return (pubdesc.IsSetComment() &&
828                             fta_remark_is_er(pubdesc.GetComment().c_str()));
829                 }
830                 return false;
831             });
832 
833 
834     for (auto pDescr : descrs.Set())
835     {
836         if (!pDescr->IsPub())
837             continue;
838 
839         CPubdesc& pub_descr = pDescr->SetPub();
840         fix_pub_equiv(pub_descr.SetPub(), pp, er);
841         if(pp->qamode)
842             fta_fix_imprint_language(pub_descr.SetPub().Set());
843         fta_fix_affil(pub_descr.SetPub().Set(), pp->source);
844         fta_strip_er_remarks(pub_descr);
845     }
846 
847     for (auto pAnnot : annots)
848     {
849         if (!pAnnot->IsSetData() || !pAnnot->GetData().IsFtable())              /* feature table */
850             continue;
851 
852 
853         for (auto pFeat : pAnnot->SetData().SetFtable())
854         {
855             if (pFeat->IsSetData() && pFeat->GetData().IsPub())   /* pub feature */
856             {
857                 fix_pub_equiv(pFeat->SetData().SetPub().SetPub(), pp, er);
858                 if(pp->qamode)
859                     fta_fix_imprint_language(pFeat->SetData().SetPub().SetPub().Set());
860                 fta_fix_affil(pFeat->SetData().SetPub().SetPub().Set(), pp->source);
861                 fta_strip_er_remarks(pFeat->SetData().SetPub());
862             }
863 
864             if (!pFeat->IsSetCit()) {
865                 continue;
866             }
867 
868             for (auto pPub : pFeat->SetCit().SetPub()) {
869                 if (pPub) {
870                     fix_pub_annot(*pPub, pp, er);
871                 }
872             }
873         }
874     }
875 }
876 
877 /**********************************************************/
878 //static void fta_find_pub(ParserPtr pp, TEntryList& seq_entries)
Apply(list<CRef<CSeq_entry>> & seq_entries)879 void CFindPub::Apply(list<CRef<CSeq_entry>>& seq_entries)
880 {
881     for (auto pEntry : seq_entries)
882     {
883         for (CTypeIterator<objects::CBioseq_set> bio_set(Begin(*pEntry)); bio_set; ++bio_set)
884         {
885             find_pub(m_pParser, bio_set->SetAnnot(), bio_set->SetDescr());
886 
887             if (bio_set->GetDescr().Get().empty())
888                 bio_set->ResetDescr();
889 
890             if (bio_set->SetAnnot().empty())
891                 bio_set->ResetAnnot();
892         }
893 
894         for (CTypeIterator<objects::CBioseq> bioseq(Begin(*pEntry)); bioseq; ++bioseq)
895         {
896             find_pub(m_pParser, bioseq->SetAnnot(), bioseq->SetDescr());
897 
898             if (bioseq->GetDescr().Get().empty())
899                 bioseq->ResetDescr();
900 
901             if (bioseq->SetAnnot().empty())
902                 bioseq->ResetAnnot();
903         }
904     }
905 }
906 
907 /**********************************************************/
fta_find_pub_explore(ParserPtr pp,TEntryList & seq_entries)908 void fta_find_pub_explore(ParserPtr pp, TEntryList& seq_entries)
909 {
910     if(pp->medserver == 0)
911         return;
912 
913     if(pp->medserver == 2)
914         pp->medserver = fta_init_med_server();
915 
916     if (pp->medserver == 1)
917     {
918         CFindPub find_pub(pp);
919         find_pub.Apply(seq_entries);
920     }
921 }
922 
923 /**********************************************************/
new_synonym(objects::COrg_ref & org_ref,objects::COrg_ref & tax_org_ref)924 static void new_synonym(objects::COrg_ref& org_ref, objects::COrg_ref& tax_org_ref)
925 {
926     if (!org_ref.CanGetSyn() || !tax_org_ref.CanGetSyn())
927         return;
928 
929     ITERATE(objects::COrg_ref::TSyn, org_syn, org_ref.GetSyn())
930     {
931         bool found = false;
932         ITERATE(objects::COrg_ref::TSyn, tax_syn, tax_org_ref.GetSyn())
933         {
934             if (*org_syn == *tax_syn)
935             {
936                 found = true;
937                 break;
938             }
939         }
940 
941         if (!found)
942         {
943             ErrPostEx(SEV_INFO, ERR_ORGANISM_NewSynonym,
944                       "New synonym: %s for [%s].",
945                       org_syn->c_str(), org_ref.GetTaxname().c_str());
946         }
947     }
948 }
949 
950 
/* Timeout value passed to CTaxon1::Init() by the taxonomy lookups in this file. */
#define TAX_SERVER_TIMEOUT 3
/* Presumably { seconds, microseconds } - confirm against the STimeout declaration. */
static const STimeout s_timeout = { TAX_SERVER_TIMEOUT, 0 };
953 
fix_synonyms(objects::CTaxon1 & taxon,objects::COrg_ref & org_ref)954 static void fix_synonyms(objects::CTaxon1& taxon, objects::COrg_ref& org_ref)
955 {
956     bool with_syns = taxon.SetSynonyms(false);
957     if (!with_syns)
958         org_ref.SetSyn().clear();
959     else
960         taxon.SetSynonyms(true);
961 }
962 
963 /**********************************************************/
fta_get_orgref_byid(ParserPtr pp,unsigned char * drop,Int4 taxid,bool isoh)964 static CRef<objects::COrg_ref> fta_get_orgref_byid(ParserPtr pp, unsigned char* drop, Int4 taxid, bool isoh)
965 {
966     CConstRef<objects::CTaxon2_data> taxdata;
967 
968     objects::CTaxon1 taxon;
969 
970     bool connection_failed = false;
971     for (size_t i = 0; i < 3 && taxdata.Empty(); ++i)
972     {
973         if (taxon.Init(&s_timeout))
974         {
975             taxdata = taxon.GetById(TAX_ID_FROM(Int4, taxid));
976         }
977         else
978         {
979             connection_failed = true;
980             break;
981         }
982     }
983 
984     CRef<objects::COrg_ref> ret;
985     if (taxdata.Empty())
986     {
987         if (connection_failed)
988         {
989             ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
990                       "Taxonomy lookup failed for taxid %d, apparently because the server is down. Cannot generate ASN.1 for this entry.",
991                       taxid);
992             *drop = 1;
993         }
994         else
995         {
996             ErrPostEx(SEV_ERROR, ERR_ORGANISM_TaxNameNotFound,
997                       "Taxname not found: [taxid %d].", taxid);
998         }
999         return ret;
1000     }
1001 
1002     if (taxdata->GetIs_species_level() != 1 && !isoh)
1003     {
1004         ErrPostEx(SEV_WARNING, ERR_ORGANISM_TaxIdNotSpecLevel,
1005                   "Taxarch hit is not on species level: [taxid %d].", taxid);
1006     }
1007 
1008     ret.Reset(new objects::COrg_ref);
1009     ret->Assign(taxdata->GetOrg());
1010     fix_synonyms(taxon, *ret);
1011 
1012     if (ret->IsSetSyn() && ret->GetSyn().empty())
1013         ret->ResetSyn();
1014 
1015     return ret;
1016 }
1017 
1018 /**********************************************************/
fta_fix_orgref_byid(ParserPtr pp,Int4 taxid,unsigned char * drop,bool isoh)1019 CRef<objects::COrg_ref> fta_fix_orgref_byid(ParserPtr pp, Int4 taxid, unsigned char* drop, bool isoh)
1020 {
1021     CRef<objects::COrg_ref> ret;
1022 
1023     if(taxid < 1 && pp->taxserver == 0)
1024         return ret;
1025 
1026     if(pp->taxserver == 2)
1027         pp->taxserver = fta_init_tax_server();
1028 
1029     if(pp->taxserver == 2)
1030     {
1031         ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
1032                   "Taxonomy lookup failed for taxid %d, because the server is down. Cannot generate ASN.1 for this entry.",
1033                   taxid);
1034         *drop = 1;
1035         return ret;
1036     }
1037 
1038     ret = fta_get_orgref_byid(pp, drop, taxid, isoh);
1039     if (ret.NotEmpty())
1040     {
1041         ErrPostEx(SEV_INFO, ERR_SERVER_TaxNameWasFound,
1042                   "Taxname _was_ found for taxid %d", taxid);
1043     }
1044 
1045     return ret;
1046 }
1047 
1048 /**********************************************************/
fta_replace_org(ParserPtr pp,unsigned char * drop,objects::COrg_ref & org_ref,const Char * pn,int merge,Int4 attempt)1049 static CRef<objects::COrg_ref> fta_replace_org(ParserPtr pp, unsigned char* drop, objects::COrg_ref& org_ref,
1050                                                            const Char* pn, int merge, Int4 attempt)
1051 {
1052     IndexblkPtr ibp = pp->entrylist[pp->curindx];
1053 
1054     CConstRef<objects::CTaxon2_data> taxdata;
1055 
1056     objects::CTaxon1 taxon;
1057 
1058     bool connection_failed = true;
1059     for (size_t i = 0; i < 3 && taxdata.Empty(); ++i)
1060     {
1061         if (taxon.Init(&s_timeout))
1062         {
1063             if (merge)
1064             {
1065                 taxdata = taxon.LookupMerge(org_ref);
1066             }
1067             else
1068                 taxdata = taxon.Lookup(org_ref);
1069             connection_failed = false;
1070             break;
1071         }
1072         else
1073             taxon.Fini();
1074     }
1075 
1076     CRef<objects::COrg_ref> ret;
1077 
1078     if (taxdata.Empty())
1079     {
1080         if(attempt == 1)
1081             return ret;
1082 
1083         if (connection_failed)
1084         {
1085             ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
1086                       "Taxonomy lookup failed for \"%s\", apparently because the server is down. Cannot generate ASN.1 for this entry.",
1087                       pn);
1088             *drop = 1;
1089         }
1090         else if(taxon.GetTaxIdByOrgRef(org_ref) < ZERO_TAX_ID)
1091         {
1092             if((pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL) &&
1093                ibp->is_pat && ibp->taxid > 0 && ibp->organism != NULL)
1094             {
1095                 ret = fta_fix_orgref_byid(pp, ibp->taxid, &ibp->drop, true);
1096                 if (ret.NotEmpty() && ret->IsSetTaxname() &&
1097                    ret->GetTaxname() == ibp->organism)
1098                 {
1099                     ibp->no_gc_warning = true;
1100                     return ret;
1101                 }
1102             }
1103             ErrPostEx(SEV_ERROR, ERR_ORGANISM_TaxIdNotUnique,
1104                       "Not an unique Taxonomic Id for [%s].", pn);
1105         }
1106         else
1107         {
1108             ErrPostEx(SEV_ERROR, ERR_ORGANISM_TaxNameNotFound,
1109                       "Taxon Id not found for [%s].", pn);
1110         }
1111         return ret;
1112     }
1113 
1114     if (taxdata->GetIs_species_level() != 1 && (ibp->is_pat == false ||
1115        (pp->source != Parser::ESource::EMBL && pp->source != Parser::ESource::DDBJ)))
1116     {
1117         ErrPostEx(SEV_WARNING, ERR_ORGANISM_TaxIdNotSpecLevel,
1118                   "Taxarch hit is not on species level for [%s].", pn);
1119     }
1120 
1121     ret.Reset(new objects::COrg_ref);
1122 
1123     if (merge)
1124         ret->Assign(org_ref);
1125     else
1126         ret->Assign(taxdata->GetOrg());
1127 
1128     return ret;
1129 }
1130 
1131 /**********************************************************/
fta_fix_orgref(ParserPtr pp,objects::COrg_ref & org_ref,unsigned char * drop,char * organelle)1132 void fta_fix_orgref(ParserPtr pp, objects::COrg_ref& org_ref, unsigned char* drop,
1133                     char* organelle)
1134 {
1135     Int4      attempt;
1136     int       merge;
1137 
1138     if (org_ref.IsSetTaxname())
1139     {
1140         std::string taxname = org_ref.GetTaxname();
1141 
1142         size_t last_char = taxname.size();
1143         for (; last_char; --last_char)
1144         {
1145             if (!isspace(taxname[last_char]))
1146                 break;
1147         }
1148 
1149         if (!isspace(taxname[last_char]))
1150             ++last_char;
1151         org_ref.SetTaxname(taxname.substr(0, last_char));
1152     }
1153 
1154     if(pp->taxserver == 0)
1155         return;
1156 
1157     if(pp->taxserver == 2)
1158         pp->taxserver = fta_init_tax_server();
1159 
1160     std::string old_taxname;
1161     if (organelle != NULL)
1162     {
1163         std::string taxname = org_ref.IsSetTaxname() ? org_ref.GetTaxname() : "",
1164                     organelle_str(organelle),
1165                     space(taxname.size() ? " " : "");
1166 
1167         old_taxname = taxname;
1168         taxname = organelle_str + space + taxname;
1169         org_ref.SetTaxname(taxname);
1170         attempt = 1;
1171     }
1172     else
1173     {
1174         attempt = 2;
1175     }
1176 
1177     std::string taxname = org_ref.IsSetTaxname() ? org_ref.GetTaxname() : "";
1178     if (pp->taxserver == 2)
1179     {
1180         ErrPostEx(SEV_FATAL, ERR_SERVER_TaxServerDown,
1181                   "Taxonomy lookup failed for \"%s\", because the server is down. Cannot generate ASN.1 for this entry.",
1182                   taxname.c_str());
1183         *drop = 1;
1184     }
1185     else
1186     {
1187         merge = (pp->format == Parser::EFormat::PIR) ? 0 : 1;
1188 
1189         CRef<objects::COrg_ref> new_org_ref = fta_replace_org(pp, drop, org_ref, taxname.c_str(), merge, attempt);
1190         if (new_org_ref.Empty() && attempt == 1)
1191         {
1192             org_ref.SetTaxname(old_taxname);
1193             old_taxname.clear();
1194             new_org_ref = fta_replace_org(pp, drop, org_ref, "", merge, 2);
1195         }
1196 
1197         if (new_org_ref.NotEmpty())
1198         {
1199             ErrPostEx(SEV_INFO, ERR_SERVER_TaxNameWasFound,
1200                       "Taxon Id _was_ found for [%s]", taxname.c_str());
1201             if(pp->format == Parser::EFormat::PIR)
1202                 new_synonym(org_ref, *new_org_ref);
1203 
1204             org_ref.Assign(*new_org_ref);
1205         }
1206     }
1207 
1208     if (org_ref.IsSetSyn() && org_ref.GetSyn().empty())
1209         org_ref.ResetSyn();
1210 }
1211 
1212 /**********************************************************/
fta_get_gi_for_seq_id(const objects::CSeq_id & id)1213 static TGi fta_get_gi_for_seq_id(const objects::CSeq_id& id)
1214 {
1215     TGi gi = objects::sequence::GetGiForId(id, GetScope());
1216     if(gi > ZERO_GI)
1217         return(gi);
1218 
1219 
1220     objects::CSeq_id test_id;
1221     test_id.SetGenbank().SetAccession(HEALTHY_ACC);
1222 
1223     int i = 0;
1224     for (; i < 5; i++)
1225     {
1226         if (objects::sequence::GetGiForId(test_id, GetScope()) > ZERO_GI)
1227             break;
1228         SleepSec(3);
1229     }
1230 
1231     if(i == 5)
1232         return GI_CONST(-1);
1233 
1234     gi = objects::sequence::GetGiForId(id, GetScope());
1235     if (gi > ZERO_GI)
1236         return(gi);
1237 
1238     return ZERO_GI;
1239 }
1240 
1241 /**********************************************************
1242  * returns -1 if couldn't get division;
1243  *          1 if it's CON;
1244  *          0 if it's not CON.
1245  */
fta_is_con_div(ParserPtr pp,const objects::CSeq_id & id,const Char * acc)1246 Int4 fta_is_con_div(ParserPtr pp, const objects::CSeq_id& id, const Char* acc)
1247 {
1248     if(pp->entrez_fetch == 0)
1249         return(-1);
1250     //if (pp->entrez_fetch == 2)
1251     //    pp->entrez_fetch = fta_init_pubseq();
1252     if(pp->entrez_fetch == 2)
1253     {
1254         ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
1255                   "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
1256                   acc);
1257         pp->entrylist[pp->curindx]->drop = 1;
1258         return(-1);
1259     }
1260 
1261     TGi gi = fta_get_gi_for_seq_id(id);
1262     if(gi < ZERO_GI)
1263     {
1264         ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
1265                   "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
1266                   acc);
1267         pp->entrylist[pp->curindx]->drop = 1;
1268         return(-1);
1269     }
1270 
1271     if (gi == ZERO_GI)
1272         return(0);
1273 #if 0 // RW-707
1274     CPubseqAccess::IdGiClass id_gi;
1275     CPubseqAccess::IdBlobClass id_blob;
1276 
1277     if (!s_pubseq->GetIdGiClass(gi, id_gi) || !s_pubseq->GetIdBlobClass(id_gi, id_blob) ||
1278         id_blob.div[0] == '\0')
1279     {
1280         ErrPostEx(SEV_ERROR, ERR_ACCESSION_CannotGetDivForSecondary,
1281                   "Failed to determine division code for secondary accession \"%s\". Entry dropped.",
1282                   acc);
1283         pp->entrylist[pp->curindx]->drop = 1;
1284         return(-1);
1285     }
1286     if (NStr::EqualNocase(id_blob.div, "CON"))
1287         return(1);
1288 #endif
1289     return(0);
1290 }
1291 
1292 /**********************************************************/
fta_citart_by_pmid(Int4 pmid,bool & done)1293 CRef<objects::CCit_art> fta_citart_by_pmid(Int4 pmid, bool& done)
1294 {
1295     CRef<objects::CCit_art> cit_art;
1296 
1297     done = true;
1298     if (pmid < 0)
1299         return cit_art;
1300 
1301     cit_art = FetchPubPmId(pmid);
1302     return cit_art;
1303 }
1304 
1305 /**********************************************************/
fta_init_gbdataloader()1306 void fta_init_gbdataloader()
1307 {
1308     objects::CGBDataLoader::RegisterInObjectManager(*objects::CObjectManager::GetInstance());
1309 }
1310 
1311 END_NCBI_SCOPE
1312