1 /* $Id: pub_fix.cpp 632623 2021-06-03 17:38:11Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Alexey Dobronadezhdin
27  *
28  * File Description:
29  *   Code for fixing up publications.
30  *   MedArch lookup and post-processing utilities.
31  *   Based on medutil.c written by James Ostell.
32  */
33 
34 #include <ncbi_pch.hpp>
35 
36 #include <objects/biblio/ArticleId.hpp>
37 #include <objects/biblio/ArticleIdSet.hpp>
38 #include <objects/biblio/Author.hpp>
39 #include <objects/biblio/Cit_art.hpp>
40 #include <objects/biblio/Cit_book.hpp>
41 #include <objects/biblio/Cit_proc.hpp>
42 #include <objects/biblio/Cit_jour.hpp>
43 #include <objects/biblio/Imprint.hpp>
44 #include <objects/biblio/Title.hpp>
45 #include <objects/general/Name_std.hpp>
46 #include <objects/general/Person_id.hpp>
47 #include <objects/general/Date.hpp>
48 #include <objects/general/Date_std.hpp>
49 #include <objects/general/Dbtag.hpp>
50 #include <objects/medline/Medline_entry.hpp>
51 
52 #include <objects/pub/Pub.hpp>
53 
54 #include <objtools/edit/pub_fix.hpp>
55 
56 #include "pub_fix_aux.hpp"
57 
58 #include <objects/mla/Title_msg.hpp>
59 #include <objects/mla/Title_msg_list.hpp>
60 #include <objects/mla/mla_client.hpp>
61 
62 #include <corelib/ncbi_message.hpp>
63 #include <objtools/eutils/api/esearch.hpp>
64 #include <objtools/eutils/esearch/IdList.hpp>
65 #include <objtools/eutils/api/esummary.hpp>
66 
67 
68 BEGIN_NCBI_SCOPE
69 BEGIN_SCOPE(objects)
70 BEGIN_SCOPE(edit)
71 
// Posts a message to an optional listener.
//   listener - pointer-like object; nothing happens when it evaluates to false
//   severity - severity passed to the CMessage_Basic constructor
//   code     - error code passed to the CMessage_Basic constructor
//   subcode  - error subcode passed to the CMessage_Basic constructor
//   message  - any chain of '<<'-streamable operands; it is streamed into a
//              local ostringstream, so it is evaluated only when a listener
//              is present
// The do { ... } while (false) wrapper lets the macro be used as a single
// statement (e.g. inside an unbraced if/else).
#define ERR_POST_TO_LISTENER(listener, severity, code, subcode, message) \
do { \
    if (listener) { \
        ostringstream ostr; \
        ostr << message; \
        string text = ostr.str(); \
        CMessage_Basic msg(text, severity, code, subcode); \
        listener->PostMessage(msg); \
    } \
} while (false)
82 
namespace fix_pub
{
// Textual description of one error category: the category name plus the
// names of all subcodes that belong to it.
struct SErrorSubcodes
{
    string m_error_str;             // category name, e.g. "FixPub"
    map<int, string> m_sub_errors;  // subcode -> subcode name
};

// Lookup table: error code -> (category name, subcode -> subcode name).
// CPubFix::GetErrorId() uses it to build "<category>.<subcode>" identifiers.
static map<int, SErrorSubcodes> ERROR_CODE_STR =
{
    // This table is also used by the blob_maint application. The string
    // "REFERENCE" was not informative there, so the category is named "FixPub".
    { err_Reference,{ "FixPub",
    {
        { err_Reference_MuidNotFound, "MuidNotFound" },
        { err_Reference_SuccessfulMuidLookup, "SuccessfulMuidLookup" },
        { err_Reference_OldInPress, "OldInPress" },
        { err_Reference_No_reference, "No_reference" },
        { err_Reference_Multiple_ref, "Multiple_ref" },
        { err_Reference_Multiple_muid, "Multiple_muid" },
        { err_Reference_MedlineMatchIgnored, "MedlineMatchIgnored" },
        { err_Reference_MuidMissmatch, "MuidMissmatch" },
        { err_Reference_NoConsortAuthors, "NoConsortAuthors" },
        { err_Reference_DiffConsortAuthors, "DiffConsortAuthors" },
        { err_Reference_PmidMissmatch, "PmidMissmatch" },
        { err_Reference_Multiple_pmid, "Multiple_pmid" },
        { err_Reference_FailedToGetPub, "FailedToGetPub" },
        { err_Reference_MedArchMatchIgnored, "MedArchMatchIgnored" },
        { err_Reference_SuccessfulPmidLookup, "SuccessfulPmidLookup" },
        { err_Reference_PmidNotFound, "PmidNotFound" },
        { err_Reference_NoPmidJournalNotInPubMed, "NoPmidJournalNotInPubMed" },
        { err_Reference_PmidNotFoundInPress, "PmidNotFoundInPress" },
        { err_Reference_NoPmidJournalNotInPubMedInPress, "NoPmidJournalNotInPubMedInPress" }
    }
    } },
    { err_Print,{ "PRINT",
    {
        { err_Print_Failed, "Failed" }
    }
    } },
    { err_AuthList,{ "AuthList",
    {
        { err_AuthList_SignificantDrop, "SignificantDrop" },
        { err_AuthList_PreserveGB, "PreserveGB" },
        { err_AuthList_LowMatch, "LowMatch" }
    }
    } }
};
}
131 
GetErrorId(int err_code,int err_sub_code)132 string CPubFix::GetErrorId(int err_code, int err_sub_code)
133 {
134     string ret;
135 
136     const auto& err_category = fix_pub::ERROR_CODE_STR.find(err_code);
137     if (err_category != fix_pub::ERROR_CODE_STR.end()) {
138 
139         const auto& error_sub_code_str = err_category->second.m_sub_errors.find(err_sub_code);
140         if (error_sub_code_str != err_category->second.m_sub_errors.end()) {
141             ret = err_category->second.m_error_str;
142             ret += '.';
143             ret += error_sub_code_str->second;
144         }
145     }
146 
147     return ret;
148 }
149 
150 
151 namespace fix_pub
152 {
153 //   MedlineToISO(tmp)
154 //       converts a MEDLINE citation to ISO/GenBank style
155 
MedlineToISO(CCit_art & cit_art)156 void MedlineToISO(CCit_art& cit_art)
157 {
158     if (cit_art.IsSetAuthors()) {
159 
160         CAuth_list& auths = cit_art.SetAuthors();
161         if (auths.IsSetNames()) {
162             if (auths.GetNames().IsMl()) {
163                 auths.ConvertMlToStandard();
164             }
165             else if (auths.GetNames().IsStd()) {
166                 for (auto& auth : auths.SetNames().SetStd()) {
167                     if (auth->IsSetName() && auth->GetName().IsMl()) {
168                         auth = CAuthor::ConvertMlToStandard(*auth);
169                     }
170                 }
171             }
172         }
173     }
174 
175     if (!cit_art.IsSetFrom() || !cit_art.GetFrom().IsJournal())
176         return;
177 
178     // from a journal - get iso_jta
179     CCit_jour& journal = cit_art.SetFrom().SetJournal();
180 
181     auto IsIso_jta = [](const CRef<CTitle::C_E>& title) -> bool { return title->IsIso_jta(); };
182 
183     if (journal.IsSetTitle() && journal.GetTitle().IsSet()) {
184 
185         auto& titles = journal.SetTitle().Set();
186 
187         if (find_if(titles.begin(), titles.end(), IsIso_jta) == titles.end()) {
188             // no iso_jta
189 
190             CTitle::C_E& first_title = *titles.front();
191             const string& title_str = journal.SetTitle().GetTitle(first_title);
192 
193             CRef<CTitle> title_new(new CTitle);
194             CRef<CTitle::C_E> type_new(new CTitle::C_E);
195             type_new->SetIso_jta(title_str);
196             title_new->Set().push_back(type_new);
197 
198             CRef<CTitle_msg> msg_new(new CTitle_msg);
199             msg_new->SetType(eTitle_type_iso_jta);
200             msg_new->SetTitle(*title_new);
201 
202             CRef<CTitle_msg_list> msg_list_new;
203             try {
204                 CMLAClient mla;
205                 msg_list_new = mla.AskGettitle(*msg_new);
206             }
207             catch (exception &) {
208                 // msg_list_new stays empty
209             }
210 
211             if (msg_list_new.NotEmpty() && msg_list_new->IsSetTitles()) {
212 
213                 bool gotit = false;
214                 for (auto& item : msg_list_new->GetTitles()) {
215                     const CTitle &cur_title = item->GetTitle();
216 
217                     if (cur_title.IsSet()) {
218 
219                         auto iso_jta_title = find_if(cur_title.Get().begin(), cur_title.Get().end(), IsIso_jta);
220                         if (iso_jta_title != cur_title.Get().end()) {
221                             gotit = true;
222                             first_title.SetIso_jta((*iso_jta_title)->GetIso_jta());
223                             break;
224                         }
225                     }
226 
227                     if (gotit)
228                         break;
229                 }
230             }
231         }
232     }
233 
234     if (journal.IsSetImp()) {
235         // remove Eng language
236         if (journal.GetImp().IsSetLanguage() && journal.GetImp().GetLanguage() == "Eng")
237             journal.SetImp().ResetLanguage();
238     }
239 }
240 
241 //   SplitMedlineEntry(mep)
242 //      splits a medline entry into 2 pubs (1 muid, 1 Cit-art)
243 //      converts Cit-art to ISO/GenBank style
244 //      deletes original medline entry
SplitMedlineEntry(CPub_equiv::Tdata & medlines)245 void SplitMedlineEntry(CPub_equiv::Tdata& medlines)
246 {
247     if (medlines.size() != 1) {
248         return;
249     }
250 
251     CPub& pub = *medlines.front();
252     CMedline_entry& medline = pub.SetMedline();
253     if (!medline.IsSetCit() && medline.IsSetPmid() && medline.GetPmid() < ZERO_ENTREZ_ID) {
254         return;
255     }
256 
257     CRef<CPub> pmid;
258     if (medline.GetPmid() > ZERO_ENTREZ_ID) {
259         pmid.Reset(new CPub);
260         pmid->SetPmid(medline.GetPmid());
261     }
262 
263     CRef<CPub> cit_art;
264     if (medline.IsSetCit()) {
265         cit_art.Reset(new CPub);
266         cit_art->SetArticle(medline.SetCit());
267         MedlineToISO(cit_art->SetArticle());
268     }
269 
270     medlines.clear();
271 
272     if (pmid.NotEmpty())
273         medlines.push_back(pmid);
274 
275     if (cit_art.NotEmpty())
276         medlines.push_back(cit_art);
277 }
278 
279 
IsInpress(const CCit_art & cit_art)280 bool IsInpress(const CCit_art& cit_art)
281 {
282     if (!cit_art.IsSetFrom())
283         return false;
284 
285     bool ret = false;
286     if (cit_art.GetFrom().IsJournal()) {
287         const CCit_jour& journal = cit_art.GetFrom().GetJournal();
288         ret = journal.IsSetImp() && journal.GetImp().IsSetPrepub() && journal.GetImp().GetPrepub() == CImprint::ePrepub_in_press;
289     }
290     else if (cit_art.GetFrom().IsBook()) {
291         const CCit_book& book = cit_art.GetFrom().GetBook();
292         ret = book.IsSetImp() && book.GetImp().IsSetPrepub() && book.GetImp().GetPrepub() == CImprint::ePrepub_in_press;
293     }
294     else if (cit_art.GetFrom().IsProc() && cit_art.GetFrom().GetProc().IsSetBook()) {
295         const CCit_book& book = cit_art.GetFrom().GetProc().GetBook();
296         ret = book.IsSetImp() && book.GetImp().IsSetPrepub() && book.GetImp().GetPrepub() == CImprint::ePrepub_in_press;
297     }
298     return ret;
299 }
300 
301 
// Checks whether 'str' has the shape of an ISSN:
//   nnnn-nnnn or nnnn-nnnX, where n is a decimal digit, e.g. 0123-5566.
// Exactly 9 characters are required, with '-' at index 4; an upper-case 'X'
// is accepted only as the last character.
bool MULooksLikeISSN(const string& str)
{
    static const size_t ISSN_SIZE = 9;
    static const size_t ISSN_DASH_POS = 4;
    static const size_t ISSN_X_POS = 8;

    // A blank string can never pass these two checks, so no separate
    // blank test is needed.
    if (str.size() != ISSN_SIZE || str[ISSN_DASH_POS] != '-') {
        return false;
    }

    for (size_t i = 0; i < ISSN_SIZE; ++i) {
        if (i == ISSN_DASH_POS) {
            continue;   // already verified to be '-'
        }

        const char ch = str[i];
        if (i == ISSN_X_POS && ch == 'X') {
            continue;
        }

        // isdigit() requires a value representable as unsigned char;
        // passing a negative char (non-ASCII byte) is undefined behaviour.
        if (!isdigit(static_cast<unsigned char>(ch))) {
            return false;
        }
    }

    return true;
}
323 
324 /*
325 bool MUIsJournalIndexed(const string& journal)
326 {
327     if (journal.empty()) {
328         return false;
329     }
330 
331     string title(journal);
332     NStr::ReplaceInPlace(title, "(", " ");
333     NStr::ReplaceInPlace(title, ")", " ");
334     NStr::ReplaceInPlace(title, ".", " ");
335 
336     title = NStr::Sanitize(title);
337 
338     CEutilsClient eutils;
339 
340     static const int MAX_ITEMS = 200;
341     eutils.SetMaxReturn(MAX_ITEMS);
342 
343     vector<string> ids;
344 
345     static const string EUTILS_DATABASE("nlmcatalog");
346 
347     try {
348         if (MULooksLikeISSN(title)) {
349             eutils.Search(EUTILS_DATABASE, title + "[issn]", ids);
350         }
351 
352         if (ids.empty()) {
353             eutils.Search(EUTILS_DATABASE, title + "[multi] AND ncbijournals[sb]", ids);
354         }
355 
356         if (ids.empty()) {
357             eutils.Search(EUTILS_DATABASE, title + "[jo]", ids);
358         }
359     }
360     catch (CException&) {
361         return false;
362     }
363 
364     if (ids.size() != 1) {
365         return false;
366     }
367 
368 
369     // getting the indexing status of the journal found
370     static const string SUMMARY_VERSION("2.0");
371     xml::document doc;
372     eutils.Summary(EUTILS_DATABASE, ids, doc, SUMMARY_VERSION);
373 
374     const xml::node& root_node = doc.get_root_node();
375     xml::node_set nodes(root_node.run_xpath_query("//DocumentSummarySet/DocumentSummary/CurrentIndexingStatus/text()"));
376 
377     string status;
378     if (nodes.size() == 1) {
379         status = nodes.begin()->get_content();
380     }
381 
382     return status == "Y";
383 }
384 */
385 
s_GetESearchIds(CESearch_Request & req,const string & term,list<string> & ids)386 static void s_GetESearchIds(CESearch_Request& req,
387                             const string& term,
388                             list<string>& ids) {
389     // error handling is modeled on that of CEUtilsClient::x_Search()
390     req.SetArgument("term", term);
391     for (int retry=0; retry<10; ++retry) {
392         try {
393             auto& istr = dynamic_cast<CConn_HttpStream&>(req.GetStream());
394             auto pRes = Ref(new esearch::CESearchResult());
395             istr >> MSerial_Xml >> *pRes;
396 
397             if (istr.GetStatusCode() == 200) {
398                 if (pRes->IsSetData()) {
399                     if (pRes->GetData().IsInfo() &&
400                         pRes->GetData().GetInfo().IsSetContent() &&
401                         pRes->GetData().GetInfo().GetContent().IsSetIdList()) {
402 
403                         const auto& idList = pRes->GetData().GetInfo().GetContent().GetIdList();
404                         if (idList.IsSetId()) {
405                             ids = idList.GetId();
406                         }
407                         req.Disconnect();
408                         return;
409                     }
410                     else
411                     if (pRes->GetData().IsERROR()) {
412                         NCBI_THROW(CException, eUnknown,
413                                 pRes->GetData().GetERROR());
414                     }
415                 } // pRest->IsSetData()
416             } // istr.GetStatusCode() == 200
417         }
418         catch(CException& e) {
419             ERR_POST(Warning << "failed on attempt " << retry + 1
420                     << ": " << e);
421         }
422         req.Disconnect();
423 
424         int sleepSeconds = sqrt(retry);
425         if (sleepSeconds) {
426             SleepSec(sleepSeconds);
427         }
428     } // retry
429 
430     NCBI_THROW(CException, eUnknown,
431             "failed to execute query: " + term);
432 }
433 
434 
s_IsIndexed(CRef<CEUtils_ConnContext> pContext,const string & id)435 static bool s_IsIndexed(CRef<CEUtils_ConnContext> pContext,
436         const string& id) {
437 
438     // error handling is modeled on that of CEUtilsClient::x_Summary()
439     CESummary_Request request("nlmcatalog", pContext);
440     request.GetId().AddId(id);
441     request.SetArgument("version", "2.0");
442     string xmlOutput;
443     bool success=false;
444     for (int retry=0; retry<10; ++retry) {
445         try {
446             auto& istr = dynamic_cast<CConn_HttpStream&>(request.GetStream());
447             NcbiStreamToString(&xmlOutput, istr);
448             if (istr.GetStatusCode() == 200) {
449                 success = true;
450                 break;
451             }
452         }
453         catch (...) {
454         }
455         request.Disconnect();
456 
457         int sleepSeconds = sqrt(retry);
458         if (sleepSeconds) {
459             SleepSec(sleepSeconds);
460         }
461     }
462 
463     if (!success) {
464         NCBI_THROW(CException, eUnknown,
465                 "failed to execute esummary request: " + request.GetQueryString());
466     }
467 
468     static const string indexingElement { "<CurrentIndexingStatus>Y</CurrentIndexingStatus>" };
469     auto firstPos = NStr::Find(xmlOutput, indexingElement, NStr::eNocase);
470     if (firstPos == NPOS) {
471         return false;
472     }
473     auto lastPos = NStr::Find(xmlOutput, indexingElement, NStr::eNocase, NStr::eReverseSearch);
474 
475     return firstPos == lastPos;
476 }
477 
478 
479 
MUIsJournalIndexed(const string & journal)480 bool MUIsJournalIndexed(const string& journal)
481 {
482     if (journal.empty()) {
483         return false;
484     }
485 
486     string title(journal);
487     NStr::ReplaceInPlace(title, "(", " ");
488     NStr::ReplaceInPlace(title, ")", " ");
489     NStr::ReplaceInPlace(title, ".", " ");
490 
491     title = NStr::Sanitize(title);
492 
493     list<string> ids;
494     auto pContext = Ref(new CEUtils_ConnContext());
495     CESearch_Request req("nlmcatalog", pContext);
496     req.SetRetMax(2);
497     req.SetUseHistory(false);
498     try {
499         if (MULooksLikeISSN(title)) {
500             s_GetESearchIds(req, title + "[issn]", ids);
501         }
502 
503         if (ids.empty()) {
504             s_GetESearchIds(req, title + "[multi] AND ncbijournals[sb]", ids);
505         }
506 
507         if (ids.empty()) {
508             s_GetESearchIds(req, title + "[jo]", ids);
509         }
510     }
511     catch (CException&) {
512         return false;
513     }
514 
515     if (ids.size() != 1) {
516         return false;
517     }
518 
519     return s_IsIndexed(pContext, ids.front());
520 }
521 
522 
523 
PrintPub(const CCit_art & cit_art,bool found,bool auth,long muid,IMessageListener * err_log)524 void PrintPub(const CCit_art& cit_art, bool found, bool auth, long muid, IMessageListener* err_log)
525 {
526     string first_name,
527         last_name;
528 
529     if (cit_art.IsSetAuthors() && cit_art.GetAuthors().IsSetNames()) {
530 
531         if (cit_art.GetAuthors().GetNames().IsStd()) {
532 
533             const CAuthor& first_author = *cit_art.GetAuthors().GetNames().GetStd().front();
534 
535             if (first_author.IsSetName()) {
536                 if (first_author.GetName().IsName()) {
537                     const CName_std& namestd = first_author.GetName().GetName();
538                     if (namestd.IsSetLast()) {
539                         last_name = namestd.GetLast();
540                     }
541                     if (namestd.IsSetInitials()) {
542                         first_name = namestd.GetInitials();
543                     }
544                 }
545                 else if (first_author.GetName().IsConsortium()) {
546                     last_name = first_author.GetName().GetConsortium();
547                 }
548             }
549         }
550         else {
551             last_name = cit_art.GetAuthors().GetNames().GetStr().front();
552         }
553     }
554     else {
555         ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Print, err_Print_Failed, "Authors NULL");
556     }
557 
558     const CImprint* imprint = nullptr;
559     const CTitle* title = nullptr;
560 
561     if (cit_art.IsSetFrom()) {
562         if (cit_art.GetFrom().IsJournal()) {
563             const CCit_jour& journal = cit_art.GetFrom().GetJournal();
564 
565             if (journal.IsSetTitle()) {
566                 title = &journal.GetTitle();
567             }
568 
569             if (journal.IsSetImp()) {
570                 imprint = &journal.GetImp();
571             }
572         }
573         else if (cit_art.GetFrom().IsBook()) {
574             const CCit_book& book = cit_art.GetFrom().GetBook();
575 
576             if (book.IsSetTitle()) {
577                 title = &book.GetTitle();
578             }
579 
580             if (book.IsSetImp()) {
581                 imprint = &book.GetImp();
582             }
583         }
584     }
585 
586     static const string UNKNOWN_JOURNAL("journal unknown");
587     string title_str(UNKNOWN_JOURNAL);
588 
589     if (title && title->IsSet() && !title->Get().empty()) {
590 
591         const CTitle::C_E& first_title = *title->Get().front();
592         const string& str = title->GetTitle(first_title);
593 
594         if (!str.empty())
595             title_str = str;
596     }
597 
598 
599     static const string NO_PAGE("no page number");
600     static const string NO_VOL("no volume number");
601 
602     string vol(NO_VOL),
603         page(NO_PAGE);
604 
605     int year = 0;
606     bool in_press = false;
607 
608     if (imprint) {
609 
610         if (imprint->IsSetVolume()) {
611             vol = imprint->GetVolume();
612         }
613 
614         if (imprint->IsSetPages()) {
615             page = imprint->GetPages();
616         }
617 
618         if (imprint->IsSetDate() && imprint->GetDate().IsStd() && imprint->GetDate().GetStd().IsSetYear()) {
619             year = imprint->GetDate().GetStd().GetYear();
620         }
621 
622         in_press = imprint->IsSetPrepub() && imprint->GetPrepub() == CImprint::ePrepub_in_press;
623     }
624 
625     if (auth) {
626         ERR_POST_TO_LISTENER(err_log, eDiag_Error, err_Reference, err_Reference_MedArchMatchIgnored,
627             "Too many author name differences: " << muid << "|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
628         return;
629     }
630 
631     if (in_press) {
632 
633         int cur_year = CDate_std(CTime(CTime::eCurrent)).GetYear();
634         static const int YEAR_MAX_DIFF = 2;
635 
636         if (year && cur_year - year > YEAR_MAX_DIFF) {
637             ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, err_Reference_OldInPress,
638                 "encountered in-press article more than 2 years old: " << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
639         }
640     }
641 
642     if (found) {
643         ERR_POST_TO_LISTENER(err_log, eDiag_Info, err_Reference, err_Reference_SuccessfulPmidLookup,
644             muid << "|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
645     }
646     else if (MUIsJournalIndexed(title_str)) {
647         if (muid) {
648             ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, in_press ? err_Reference_PmidNotFoundInPress : err_Reference_PmidNotFound,
649                 ">>" << muid << "<<|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
650         }
651         else {
652             ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, in_press ? err_Reference_PmidNotFoundInPress : err_Reference_PmidNotFound,
653                 last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
654         }
655     }
656     else {
657         if (muid) {
658             ERR_POST_TO_LISTENER(err_log, eDiag_Info, err_Reference, in_press ? err_Reference_NoPmidJournalNotInPubMedInPress : err_Reference_NoPmidJournalNotInPubMed,
659                 ">>" << muid << "<<|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
660         }
661         else {
662             ERR_POST_TO_LISTENER(err_log, eDiag_Info, err_Reference, in_press ? err_Reference_NoPmidJournalNotInPubMedInPress : err_Reference_NoPmidJournalNotInPubMed,
663                 last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
664         }
665     }
666 }
667 
668 
IsFromBook(const CCit_art & art)669 bool IsFromBook(const CCit_art& art)
670 {
671     return art.IsSetFrom() && art.GetFrom().IsBook();
672 }
673 
674 
675 static const size_t MAX_MATCH_COEFF = 3;
676 
TenAuthorsCompare(CCit_art & cit_old,CCit_art & cit_new)677 bool TenAuthorsCompare(CCit_art& cit_old, CCit_art& cit_new)
678 {
679     _ASSERT(cit_old.IsSetAuthors() && cit_new.IsSetAuthors() &&
680         cit_old.GetAuthors().IsSetNames() && cit_new.GetAuthors().IsSetNames() && "Both arguments should have valid author's names at this point");
681 
682     const CAuth_list::C_Names& old_names = cit_old.GetAuthors().GetNames();
683     const CAuth_list::C_Names& new_names = cit_new.GetAuthors().GetNames();
684 
685     auto StrNotEmpty = [](const string& str) -> bool { return !str.empty();  };
686     size_t new_num_of_authors = count_if(new_names.GetStr().begin(), new_names.GetStr().end(), StrNotEmpty),
687            num_of_authors = count_if(old_names.GetStr().begin(), old_names.GetStr().end(), StrNotEmpty);
688 
689     size_t match = 0;
690     for (auto& name : old_names.GetStr()) {
691 
692         if (!name.empty()) {
693             if (NStr::FindNoCase(new_names.GetStr(), name) != nullptr) {
694                 ++match;
695             }
696         }
697     }
698 
699     size_t min_num_of_authors = min(num_of_authors, new_num_of_authors);
700 
701     if (min_num_of_authors > MAX_MATCH_COEFF * match) {
702         return false;
703     }
704 
705     static const size_t MAX_AUTHORS = 10;
706     if (min_num_of_authors > MAX_AUTHORS) {
707         cit_new.SetAuthors(cit_old.SetAuthors());
708         cit_old.ResetAuthors();
709     }
710 
711     return true;
712 }
713 
ExtractConsortiums(const CAuth_list::C_Names::TStd & names,CAuth_list::C_Names::TStr & extracted)714 size_t ExtractConsortiums(const CAuth_list::C_Names::TStd& names, CAuth_list::C_Names::TStr& extracted)
715 {
716     size_t num_of_names = 0;
717 
718     for (auto& name: names)
719     {
720         const CAuthor& auth = *name;
721         if (auth.IsSetName() && auth.GetName().IsName()) {
722             ++num_of_names;
723         }
724         else if (auth.IsSetName() && auth.GetName().IsConsortium()) {
725 
726             const string& cur_consortium = auth.GetName().GetConsortium();
727             extracted.push_back(cur_consortium);
728         }
729     }
730 
731     extracted.sort([](const string& a, const string& b) { return NStr::CompareNocase(a, b) == -1;  });
732 
733     return num_of_names;
734 }
735 
736 
GetFirstTenNames(const CAuth_list::C_Names::TStd & names,list<CTempString> & res)737 void GetFirstTenNames(const CAuth_list::C_Names::TStd& names, list<CTempString>& res)
738 {
739     static const size_t MAX_EXTRACTED = 10;
740     size_t extracted = 0;
741 
742     for (auto& name : names) {
743         if (name->IsSetName() && name->GetName().IsName() && name->GetName().GetName().IsSetLast()) {
744             res.push_back(name->GetName().GetName().GetLast());
745             ++extracted;
746 
747             if (extracted == MAX_EXTRACTED) {
748                 break;
749             }
750         }
751     }
752 }
753 
754 
// Reconciles the author list of the original article 'cit' with the author
// list of the looked-up article 'new_cit'.
//   cit     - original article; its author list may be moved out and reset
//   new_cit - article returned by the lookup; may receive the author list
//             of 'cit' and/or the consortium authors of 'cit'
//   err_log - listener for warnings about consortium differences; may be null
// Returns false when the two author lists match too poorly (the smaller name
// count exceeds MAX_MATCH_COEFF times the number of matching last names),
// true in every other case.
bool TenAuthorsProcess(CCit_art& cit, CCit_art& new_cit, IMessageListener* err_log)
{
    // The new article has no author names at all: take over the old list
    if (!new_cit.IsSetAuthors() || !new_cit.GetAuthors().IsSetNames()) {
        if (cit.IsSetAuthors()) {
            new_cit.SetAuthors(cit.SetAuthors());
            cit.ResetAuthors();
        }
        return true;
    }

    // Nothing to compare against, or the two lists are of different kinds
    if (!cit.IsSetAuthors() || !cit.GetAuthors().IsSetNames() ||
        cit.GetAuthors().GetNames().Which() != new_cit.GetAuthors().GetNames().Which()) {
        return true;
    }

    // Lists that are not in standard form are compared by TenAuthorsCompare()
    if (!cit.GetAuthors().GetNames().IsStd()) {
        return TenAuthorsCompare(cit, new_cit);
    }

    CAuth_list::C_Names::TStr old_consortiums;
    size_t num_names = ExtractConsortiums(cit.GetAuthors().GetNames().GetStd(), old_consortiums);

    CAuth_list::C_Names::TStr new_consortiums;
    size_t new_num_names = ExtractConsortiums(new_cit.GetAuthors().GetNames().GetStd(), new_consortiums);

    if (!old_consortiums.empty()) {

        string old_cons_list = NStr::Join(old_consortiums, ";");
        if (new_consortiums.empty()) {

            ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, err_Reference_NoConsortAuthors,
                "Publication as returned by MedArch lacks consortium authors of the original publication : \"" << old_cons_list << "\".");

            // Carry the original consortium authors over to the new article.
            // NOTE(review): push_front in a loop puts them at the head of the
            // list in the reverse of their sorted order - confirm intended.
            for_each(old_consortiums.begin(), old_consortiums.end(),
                [&new_cit](const string& consortium) {

                CRef<CAuthor> auth(new CAuthor);
                auth->SetName().SetConsortium(consortium);

                new_cit.SetAuthors().SetNames().SetStd().push_front(auth);
            });
        }
        else {

            string new_cons_list = NStr::Join(new_consortiums, ";");
            if (!NStr::EqualNocase(old_cons_list, new_cons_list)) {
                ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, err_Reference_DiffConsortAuthors,
                    "Consortium author names differ. Original is \"" << old_cons_list << "\". MedArch's is \"" << new_cons_list << "\".");
            }
        }

        // The original article has consortium authors only
        if (num_names == 0) {
            return true;
        }
    }

    // Count the original last names found (ignoring case) among the first
    // ten last names of the new article
    list<CTempString> new_author_names;
    GetFirstTenNames(new_cit.GetAuthors().GetNames().GetStd(), new_author_names);
    size_t match = 0;

    for (auto& name: cit.GetAuthors().GetNames().GetStd())
    {
        const CAuthor& auth = *name;
        if (auth.IsSetName() && auth.GetName().IsName() && auth.GetName().GetName().IsSetLast()) {

            const string& last_name = auth.GetName().GetName().GetLast();
            if (find_if(new_author_names.begin(), new_author_names.end(),
                [&last_name](const CTempString& cur_name)
                {
                    return NStr::EqualNocase(last_name, cur_name);
                }) != new_author_names.end()) {

                ++match;
            }
        }
    }

    size_t min_num_names = min(num_names, new_author_names.size());
    if (min_num_names > MAX_MATCH_COEFF * match) {
        return false;
    }

    // Decide whether the original author list should replace the new one
    bool replace_authors = new_num_names == 0;
    if (!replace_authors && new_num_names < num_names) {
        // Check the last author from PubMed: if it is "et al", the new list
        // is truncated and the old authors list is used instead.
        // (new_num_names != 0 here, so the list is not empty and back() is safe.)
        const CAuthor& last_author = *new_cit.GetAuthors().GetNames().GetStd().back();
        if (last_author.IsSetName() && last_author.GetName().IsName()) {

            const CName_std& name = last_author.GetName().GetName();
            string last_name = name.IsSetLast() ? name.GetLast() : "",
                   initials = name.IsSetInitials() ? name.GetInitials() : "";

            replace_authors = NStr::EqualNocase(last_name, "et") &&
                              NStr::EqualNocase(initials, "al");
        }

        // If the last author does not contain "et al", look at the amount of authors
        // This is done according to the next document:
        // ~cavanaug/WORK/MedArch/doc.medarch.4genbank.txt
        //
        //    If the MedArchCitArt has zero Name-std Author.name ...
        //
        //    Or if the InputCitArt has more than 10 Name - std Author.name while
        //    the MedArchCitArt has less than 12 ...
        //
        //    Or if the InputCitArt has more than 25 Name - std Author.name while
        //    the MedArchCitArt has less than 27 ...
        //
        //    Then free the Auth - list of the MedArchCitArt and replace it with
        //     the Auth - list of the InputCitArt, and **null out** the Auth - list
        //     of the MedArchCitArt .
        if (!replace_authors)
        {
            static const int MIN_FIRST_AUTHORS_THRESHOLD_1995 = 10;
            static const int MAX_FIRST_AUTHORS_THRESHOLD_1995 = 12;

            static const int MIN_SECOND_AUTHORS_THRESHOLD_1999 = 25;
            static const int MAX_SECOND_AUTHORS_THRESHOLD_1999 = 27;

            replace_authors = (new_num_names < MAX_FIRST_AUTHORS_THRESHOLD_1995 && num_names > MIN_FIRST_AUTHORS_THRESHOLD_1995) ||
                              (new_num_names < MAX_SECOND_AUTHORS_THRESHOLD_1999 && num_names > MIN_SECOND_AUTHORS_THRESHOLD_1999);
        }
    }

    if (replace_authors) {
        new_cit.SetAuthors(cit.SetAuthors());
        cit.ResetAuthors();
    }

    return true;
}
886 
887 
MergeNonPubmedPubIds(const CCit_art & cit_old,CCit_art & cit_new)888 void MergeNonPubmedPubIds(const CCit_art& cit_old, CCit_art& cit_new)
889 {
890     if (!cit_old.IsSetIds()) {
891         return;
892     }
893 
894     const CArticleIdSet& old_ids = cit_old.GetIds();
895 
896     for (auto& cur_id: old_ids.Get()) {
897 
898         if (!cur_id->IsDoi() && !cur_id->IsOther()) {
899             continue;
900         }
901 
902         bool found = false;
903         if (cit_new.IsSetIds()) {
904 
905             auto& new_ids = cit_new.GetIds().Get();
906             found = find_if(new_ids.begin(), new_ids.end(),
907                 [&cur_id](const CRef<CArticleId>& new_id)
908             {
909                 if (cur_id->Which() != new_id->Which()) {
910                     return false;
911                 }
912 
913                 if (new_id->IsDoi()) {
914                     return true;
915                 }
916 
917                 bool res = cur_id->GetOther().IsSetDb() == new_id->GetOther().IsSetDb();
918                 if (res && cur_id->GetOther().IsSetDb()) {
919                     res = cur_id->GetOther().GetDb() == new_id->GetOther().GetDb();
920                 }
921                 return res;
922             }) != new_ids.end();
923         }
924 
925         if (!found) {
926             cit_new.SetIds().Set().push_front(cur_id);
927         }
928     }
929 }
930 
931 
NeedToPropagateInJournal(const CCit_art & cit_art)932 bool NeedToPropagateInJournal(const CCit_art& cit_art)
933 {
934     if (!cit_art.IsSetFrom() || !cit_art.GetFrom().IsJournal() ||
935         !cit_art.GetFrom().GetJournal().IsSetTitle() || !cit_art.GetFrom().GetJournal().GetTitle().IsSet() ||
936         cit_art.GetFrom().GetJournal().GetTitle().Get().empty()) {
937         return true;
938     }
939 
940     const CCit_jour& journal = cit_art.GetFrom().GetJournal();
941     if (!journal.IsSetImp()) {
942         return true;
943     }
944 
945     if (!journal.GetImp().IsSetVolume() || !journal.GetImp().IsSetPages() || !journal.GetImp().IsSetDate()) {
946         return true;
947     }
948 
949     return false;
950 }
951 
952 
PropagateInPress(bool inpress,CCit_art & cit_art)953 void PropagateInPress(bool inpress, CCit_art& cit_art)
954 {
955     if (!inpress)
956         return;
957 
958     if (!cit_art.IsSetFrom() || !NeedToPropagateInJournal(cit_art)) {
959         return;
960     }
961 
962     CImprint* imprint = nullptr;
963 
964     switch (cit_art.GetFrom().Which()) {
965 
966     case CCit_art::C_From::e_Journal:
967         if (cit_art.GetFrom().GetJournal().IsSetImp()) {
968             imprint = &cit_art.SetFrom().SetJournal().SetImp();
969         }
970         break;
971 
972     case CCit_art::C_From::e_Book:
973         if (cit_art.GetFrom().GetBook().IsSetImp()) {
974             imprint = &cit_art.SetFrom().SetBook().SetImp();
975         }
976         break;
977 
978     case CCit_art::C_From::e_Proc:
979         if (cit_art.GetFrom().GetProc().IsSetBook() && cit_art.GetFrom().GetProc().GetBook().IsSetImp()) {
980             imprint = &cit_art.SetFrom().SetProc().SetBook().SetImp();
981         }
982         break;
983 
984     default:; // do nothing
985     }
986 
987     if (imprint) {
988         imprint->SetPrepub(CImprint::ePrepub_in_press);
989     }
990 }
991 
992 }
993 
994 using namespace fix_pub;
995 
// Rebuilds the contents of a Pub-equiv.
//
// The member pubs are first sorted into buckets: muids, pmids, Medline
// entries, cit-arts and "others" (a cit-art for which IsFromBook() is true is
// put into "others"). The list of the equiv is then cleared and refilled:
//   - if a muid or a pmid is present and m_always_lookup is off, the buckets
//     are put back in the order cit-arts, muids, pmids, medlines, others and
//     no lookup is performed;
//   - otherwise "others" go first, followed by the first Medline entry after
//     it went through SplitMedlineEntry(), and then
//       * with a cit-art: the first cit-art is sent to
//         CMLAClient::AskCitmatchpmid(). On a match the pmid pub is added and,
//         when m_replace_cit is on and the fetched article passes the author
//         list check, the fetched article replaces the input one;
//       * with a pmid but no cit-art: the article is fetched by pmid and
//         added when m_replace_cit is on;
//       * otherwise the existing pmid or, failing that, the muid is kept.
// Problems are reported through m_err_log.
void CPubFix::FixPubEquiv(CPub_equiv& pub_equiv)
{
    CPub_equiv::Tdata muids,
        pmids,
        medlines,
        others,
        cit_arts;

    // Sort the pubs into buckets by their kind
    if (pub_equiv.IsSet()) {
        for (auto& pub: pub_equiv.Set())
        {
            if (pub->IsMuid()) {
                muids.push_back(pub);
            }
            else if (pub->IsPmid()) {
                pmids.push_back(pub);
            }
            else if (pub->IsArticle()) {
                if (IsFromBook(pub->GetArticle())) {
                    others.push_back(pub);
                }
                else {
                    cit_arts.push_back(pub);
                }
            }
            else if (pub->IsMedline()) {
                medlines.push_back(pub);
            }
            else {
                others.push_back(pub);
            }
        }
    }

    // The buckets hold their own references to the pubs, so the list can be
    // emptied and rebuilt from them
    auto& pub_list = pub_equiv.Set();
    pub_list.clear();

    if ((!muids.empty() || !pmids.empty()) && !m_always_lookup) {
        // pmid or muid is present
        pub_list.splice(pub_list.end(), cit_arts);
        pub_list.splice(pub_list.end(), muids);
        pub_list.splice(pub_list.end(), pmids);
        pub_list.splice(pub_list.end(), medlines);
        pub_list.splice(pub_list.end(), others);
        return;
    }

    pub_list.splice(pub_list.end(), others);

    if (!medlines.empty())
    {
        if (medlines.size() > 1) {
            ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_ref, "More than one Medline entry in Pub-equiv");
            // only the first entry is kept
            medlines.resize(1);
        }

        SplitMedlineEntry(medlines);
        pub_list.splice(pub_list.end(), medlines);
    }

    TEntrezId oldpmid = ZERO_ENTREZ_ID;
    if (!pmids.empty()) {

        oldpmid = pmids.front()->GetPmid();

        // check if more than one
        for (auto& pub: pmids) {
            if (pub->GetPmid() != oldpmid) {
                ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_pmid,
                    "Two different pmids in Pub-equiv [" << oldpmid << "] [" << pub->GetPmid() << "]");
            }
        }
        // only the first pmid pub is kept
        pmids.resize(1);
    }

    TEntrezId oldmuid = ZERO_ENTREZ_ID;
    if (!muids.empty()) {

        oldmuid = muids.front()->GetMuid();

        // check if more than one
        for (auto& pub : muids) {
            if (pub->GetMuid() != oldmuid) {
                ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_pmid,
                    "Two different muids in Pub-equiv  [" << oldmuid << "] [" << pub->GetMuid() << "]");
            }
        }
        // only the first muid pub is kept
        muids.resize(1);
    }

    if (!cit_arts.empty()) {
        if (cit_arts.size() > 1) {
            // ditch extras
            ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_ref, "More than one Cit-art in Pub-equiv");
            cit_arts.resize(1);
        }

        CCit_art* cit_art = &cit_arts.front()->SetArticle();
        // remembered now because cit_art may be switched to the fetched article below
        bool inpress = IsInpress(*cit_art);

        // The citation match request takes a Pub, so wrap the cit-art into one
        CRef<CPub> new_pub(new CPub);
        new_pub->SetArticle(*cit_art);

        TEntrezId pmid = ZERO_ENTREZ_ID;
        try {
            CMLAClient mla;
            pmid = ENTREZ_ID_FROM(int, mla.AskCitmatchpmid(*new_pub));
        }
        catch (exception &) {
            // pmid == 0
        }

        if ( pmid != ZERO_ENTREZ_ID ) {

            PrintPub(*cit_art, true, false, ENTREZ_ID_TO(long, pmid), m_err_log);

            if (oldpmid > ZERO_ENTREZ_ID && oldpmid != pmid) {
                // already had a pmid
                ERR_POST_TO_LISTENER(m_err_log, eDiag_Error, err_Reference, err_Reference_PmidMissmatch,
                    "OldPMID=" << oldpmid << " doesn't match lookup (" << pmid << "). Keeping lookup.");
            }

            // Stays true when the input cit-art is kept without an attempt to
            // validate a replacement (m_replace_cit is off or the fetch failed)
            bool set_pmid = true;
            if (m_replace_cit) {

                CRef<CCit_art> new_cit_art = FetchPubPmId(pmid);

                if (new_cit_art.NotEmpty()) {

                    // Decide whether the fetched article may replace the input one
                    bool new_cit_is_valid(false);
                    if (CAuthListValidator::enabled) {
                        CAuthListValidator::EOutcome outcome = m_authlist_validator.validate(*cit_art, *new_cit_art);
                        switch (outcome) {
                        case CAuthListValidator::eAccept_pubmed:
                            new_cit_is_valid = true;
                            break;
                        case CAuthListValidator::eKeep_genbank:
                            // the fetched article is used, but with the authors of the input one
                            new_cit_art->SetAuthors(cit_art->SetAuthors());
                            cit_art->ResetAuthors();
                            new_cit_is_valid = true;
                            break;
                        case CAuthListValidator::eFailed_validation:
                            new_cit_is_valid = false;
                            break;
                        default:
                            throw logic_error("Invalid outcome returned by CAuthListValidator::validate(): " + std::to_string(outcome));
                        }
                    }
                    else {
                        new_cit_is_valid = TenAuthorsProcess(*cit_art, *new_cit_art, m_err_log);
                    }

                    if (new_cit_is_valid) {
                        // Output: the pmid pub followed by the fetched article
                        if (pmids.empty()) {
                            CRef<CPub> pmid_pub(new CPub);
                            pmids.push_back(pmid_pub);
                        }

                        pmids.front()->SetPmid().Set(pmid);
                        pub_list.splice(pub_list.end(), pmids);

                        CRef<CPub> cit_pub(new CPub);
                        cit_pub->SetArticle(*new_cit_art);
                        pub_list.push_back(cit_pub);

                        if (m_merge_ids) {
                            MergeNonPubmedPubIds(*cit_art, cit_pub->SetArticle());
                        }

                        cit_arts.clear();
                        cit_arts.push_back(cit_pub);
                        // from here on cit_art refers to the fetched article, so
                        // PropagateInPress() below is applied to it
                        cit_art = new_cit_art;
                    }
                    else {
                        // The fetched article was rejected: no pmid pub is added
                        // and the input cit-art is kept
                        pmids.clear();

                        PrintPub(*cit_art, false, true, ENTREZ_ID_TO(long, pmid), m_err_log);
                        pub_list.splice(pub_list.end(), cit_arts);
                    }

                    set_pmid = false;
                }
                else {
                    ERR_POST_TO_LISTENER(m_err_log, eDiag_Error, err_Reference, err_Reference_FailedToGetPub,
                        "Failed to get pub from MedArch server for pmid = " << pmid << ". Input one is preserved.");
                }
            }

            if (set_pmid) {
                // Output: the pmid pub followed by the input cit-art
                if (pmids.empty()) {
                    CRef<CPub> pmid_pub(new CPub);
                    pmids.push_back(pmid_pub);
                }

                pmids.front()->SetPmid().Set(pmid);
                pub_list.splice(pub_list.end(), pmids);

                MedlineToISO(*cit_art);

                pub_list.splice(pub_list.end(), cit_arts);
            }

            PropagateInPress(inpress, *cit_art);
            return;
        }

        // The citation match found nothing: the input cit-art is kept.
        // NOTE(review): the pmid / muid pubs collected above are not put back
        // into the list on this path - confirm that this is intended.
        PrintPub(*cit_art, false, false, ENTREZ_ID_TO(long, oldpmid), m_err_log);
        PropagateInPress(inpress, *cit_art);
        pub_list.splice(pub_list.end(), cit_arts);

        return;
    }

    if (oldpmid != ZERO_ENTREZ_ID) {
        // have a pmid but no cit-art

        CRef<CCit_art> new_cit_art = FetchPubPmId(oldpmid);

        if (new_cit_art.NotEmpty()) {

            pub_list.splice(pub_list.end(), pmids);

            if (m_replace_cit) {
                MedlineToISO(*new_cit_art);
                CRef<CPub> cit_pub(new CPub);
                cit_pub->SetArticle(*new_cit_art);
                pub_list.push_back(cit_pub);
            }

            return;
        }
        ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_No_reference,
            "Cant find article for pmid [" << oldpmid << "]");
    }

    // Nothing could be looked up or fetched: keep the pmid or, if there is
    // none, the muid
    if (oldpmid > ZERO_ENTREZ_ID) {
        pub_list.splice(pub_list.end(), pmids);
    }
    else if (oldmuid > ZERO_ENTREZ_ID) {
        pub_list.splice(pub_list.end(), muids);
    }
}
1238 
1239 
1240 // Tries to make any Pub into muid / cit - art
FixPub(CPub & pub)1241 void CPubFix::FixPub(CPub& pub)
1242 {
1243     switch (pub.Which()) {
1244 
1245     case CPub::e_Medline:
1246     {
1247         CRef<CPub_equiv> pub_equiv(new CPub_equiv);
1248         pub_equiv->Set().push_back(CRef<CPub>(new CPub));
1249         pub_equiv->Set().front()->Assign(pub);
1250 
1251         SplitMedlineEntry(pub_equiv->Set());
1252         pub.SetEquiv().Assign(*pub_equiv);
1253     }
1254     break;
1255 
1256     case CPub::e_Article:
1257     {
1258         CCit_art& cit_art = pub.SetArticle();
1259         if (cit_art.IsSetFrom() && cit_art.GetFrom().IsBook()) {
1260             return;
1261         }
1262 
1263         TEntrezId pmid = ZERO_ENTREZ_ID;
1264         try {
1265             CMLAClient mla;
1266             pmid = ENTREZ_ID_FROM(int, mla.AskCitmatchpmid(pub));
1267         }
1268         catch (exception &) {
1269             // pmid == 0;
1270         }
1271 
1272         if (pmid > ZERO_ENTREZ_ID) {
1273             PrintPub(cit_art, true, false, ENTREZ_ID_TO(long, pmid), m_err_log);
1274             if (m_replace_cit) {
1275                 CRef<CCit_art> new_cit_art = FetchPubPmId(pmid);
1276 
1277                 if (new_cit_art.NotEmpty()) {
1278                     if (TenAuthorsProcess(cit_art, *new_cit_art, m_err_log)) {
1279 
1280                         if (m_merge_ids) {
1281                             MergeNonPubmedPubIds(*new_cit_art, cit_art);
1282                         }
1283 
1284                         CRef<CPub> new_pub(new CPub);
1285                         new_pub->SetArticle(*new_cit_art);
1286                         pub.SetEquiv().Set().push_back(new_pub);
1287 
1288                         new_pub.Reset(new CPub);
1289                         new_pub->SetPmid().Set(pmid);
1290                         pub.SetEquiv().Set().push_back(new_pub);
1291                     }
1292                     else {
1293                         PrintPub(cit_art, false, true, ENTREZ_ID_TO(long, pmid), m_err_log);
1294                         MedlineToISO(cit_art);
1295                     }
1296                 }
1297             }
1298             else {
1299                 PrintPub(cit_art, false, false, ENTREZ_ID_TO(long, pmid), m_err_log);
1300                 MedlineToISO(cit_art);
1301             }
1302         }
1303     }
1304     break;
1305 
1306     case CPub::e_Equiv:
1307         FixPubEquiv(pub.SetEquiv());
1308         break;
1309 
1310     default:; // do nothing
1311     }
1312 }
1313 
FetchPubPmId(TEntrezId pmid)1314 CRef<CCit_art> CPubFix::FetchPubPmId(TEntrezId pmid)
1315 {
1316     CRef<CCit_art> cit_art;
1317     if (pmid < ZERO_ENTREZ_ID)
1318         return cit_art;
1319 
1320     CRef<CPub> pub;
1321     try {
1322         CMLAClient mla;
1323         pub = mla.AskGetpubpmid(CPubMedId(pmid));
1324     }
1325     catch (exception &) {
1326         pub.Reset();
1327     }
1328 
1329     if (pub.NotEmpty() && pub->IsArticle()) {
1330         cit_art.Reset(new CCit_art);
1331         cit_art->Assign(pub->GetArticle());
1332 
1333         MedlineToISO(*cit_art);
1334     }
1335 
1336     return cit_art;
1337 }
1338 
// Class-wide settings of CAuthListValidator. Configure() may override
// 'enabled' and the two ratios from a registry section.

// Verified in ID-6550, so the validator is used by default.
// Setting it to false would lead to a few bugs.
bool CAuthListValidator::enabled = true;
// Set by Configure(); the constructor calls Configure() only while this is false
bool CAuthListValidator::configured = false;
// validate() fails when matched / min(gb, pm) is below this value
double CAuthListValidator::cfg_matched_to_min = 0.3333;
// validate() posts a warning when removed / gb is above this value
double CAuthListValidator::cfg_removed_to_gb = 0.3333;
Configure(const CNcbiRegistry & cfg,const string & section)1344 void CAuthListValidator::Configure(const CNcbiRegistry& cfg, const string& section)
1345 {
1346     enabled = cfg.GetBool(section, "enabled", enabled);
1347     cfg_matched_to_min = cfg.GetDouble(section, "matched_to_min", cfg_matched_to_min);
1348     cfg_removed_to_gb = cfg.GetDouble(section, "removed_to_gb", cfg_removed_to_gb);
1349     configured = true;
1350 }
1351 
CAuthListValidator(IMessageListener * err_log)1352 CAuthListValidator::CAuthListValidator(IMessageListener* err_log)
1353     : outcome(eNotSet), pub_year(0), reported_limit("not initialized"), m_err_log(err_log)
1354 {
1355     if (! configured) {
1356         Configure(CNcbiApplication::Instance()->GetConfig(), "auth_list_validator");
1357     }
1358 }
1359 
validate(const CCit_art & gb_art,const CCit_art & pm_art)1360 CAuthListValidator::EOutcome CAuthListValidator::validate(const CCit_art& gb_art, const CCit_art& pm_art)
1361 {
1362     outcome = eNotSet;
1363     pub_year = 0;
1364     pub_year = pm_art.GetFrom().GetJournal().GetImp().GetDate().GetStd().GetYear();
1365     if (pub_year < 1900 || pub_year > 3000) {
1366         throw logic_error("Publication from PubMed has invalid year: " + std::to_string(pub_year));
1367     }
1368     gb_type = CAuth_list::C_Names::SelectionName(gb_art.GetAuthors().GetNames().Which());
1369     get_lastnames(gb_art.GetAuthors(), removed, gb_auth_string);
1370     pm_type = CAuth_list::C_Names::SelectionName(pm_art.GetAuthors().GetNames().Which());
1371     get_lastnames(pm_art.GetAuthors(), added, pm_auth_string);
1372     matched.clear();
1373     compare_lastnames();
1374     actual_matched_to_min = double(cnt_matched) / cnt_min;
1375     actual_removed_to_gb = double(cnt_removed) / cnt_gb;
1376     if (actual_removed_to_gb > cfg_removed_to_gb) {
1377         ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_AuthList, err_AuthList_SignificantDrop,
1378             "Too many authors removed (" << cnt_removed << ") compared to total Genbank authors (" << cnt_gb << ")");
1379     }
1380     // determine outcome according to ID-6514 (see fix_pub.hpp)
1381     if (pub_year > 1999) {
1382         reported_limit = "Unlimited";
1383         outcome = eAccept_pubmed;
1384     }
1385     else if (pub_year > 1995) {
1386         reported_limit = "25 authors";
1387         if (cnt_gb > 25) {
1388             ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_AuthList, err_AuthList_PreserveGB,
1389                 "Preserving original " << cnt_gb << " GB authors, ignoring " << cnt_pm << " PubMed authors "
1390                 << "(PubMed limit was " << reported_limit << " in pub.year " << pub_year << ")");
1391             outcome = eKeep_genbank;
1392         }
1393         else {
1394             outcome = eAccept_pubmed;
1395         }
1396     }
1397     else { // pub_year < 1996
1398         reported_limit = "10 authors";
1399         if (cnt_gb > 10) {
1400             ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_AuthList, err_AuthList_PreserveGB,
1401                 "Preserving original " << cnt_gb << " GB authors, ignoring " << cnt_pm << " PubMed authors "
1402                 << "(PubMed limit was " << reported_limit << " in pub.year " << pub_year << ")");
1403             outcome = eKeep_genbank;
1404         }
1405         else {
1406             outcome = eAccept_pubmed;
1407         }
1408     }
1409     // check minimum required # of matching authors
1410     if (actual_matched_to_min < cfg_matched_to_min) {
1411         ERR_POST_TO_LISTENER(m_err_log, eDiag_Error, err_AuthList, err_AuthList_LowMatch,
1412             "Only " << cnt_matched << " authors matched between " << cnt_gb << " Genbank and "
1413             << cnt_pm << " PubMed. Match/Min ratio " << fixed << setprecision(2) << actual_matched_to_min
1414             << " is below threshold " << fixed << setprecision(2) << cfg_matched_to_min);
1415         outcome = eFailed_validation;
1416     }
1417     return outcome;
1418 }
1419 
DebugDump(CNcbiOstream & out) const1420 void CAuthListValidator::DebugDump(CNcbiOstream& out) const
1421 {
1422     out << "\n--- Debug Dump of CAuthListValidator object ---\n";
1423     out << "pub_year: " << pub_year << "\n";
1424     out << "PubMed Auth-list limit in " << pub_year << ": " << reported_limit << "\n";
1425     out << "Configured ratio 'matched' to 'min(gb,pm)': " << cfg_matched_to_min
1426         << "; actual: " << actual_matched_to_min << "\n";
1427     out << "Configured ratio 'removed' to 'gb': " << cfg_removed_to_gb
1428         << "; actual: " << actual_removed_to_gb << "\n";
1429     out << "GB author list type: " << gb_type << "; # of entries: " << cnt_gb << "\n";
1430     out << "PM author list type: " << pm_type << "; # of entries: " << cnt_pm << "\n";
1431     dumplist("Matched", matched, out);
1432     dumplist("Added", added, out);
1433     dumplist("Removed", removed, out);
1434     const char* outcome_names[] = {"NotSet", "Failed_validation", "Accept_pubmed", "Keep_genbank"};
1435     out << "Outcome reported: " << outcome_names[outcome] << "(" << outcome << ")\n";
1436     out << "--- End of Debug Dump of CAuthListValidator object ---\n\n";
1437 }
1438 
dumplist(const char * hdr,const list<string> & lst,CNcbiOstream & out) const1439 void CAuthListValidator::dumplist(const char* hdr, const list<string>& lst, CNcbiOstream& out) const
1440 {
1441     out << lst.size() << " " << hdr << " authors:\n";
1442     for (const auto& a : lst)
1443         out << "    " << a << "\n";
1444 }
1445 
compare_lastnames()1446 void CAuthListValidator::compare_lastnames()
1447 {
1448     auto gbit = removed.begin();
1449     while (gbit != removed.end()) {
1450         list<string>::iterator gbnext(gbit);
1451         ++gbnext;
1452         list<string>::iterator pmit = std::find(added.begin(), added.end(), *gbit);
1453         if (pmit != added.end()) {
1454             matched.push_back(*gbit);
1455             removed.erase(gbit++);
1456             added.erase(pmit);
1457         }
1458         gbit = gbnext;
1459     }
1460     cnt_matched = matched.size();
1461     cnt_removed = removed.size();
1462     cnt_added = added.size();
1463     cnt_gb = cnt_matched + cnt_removed;
1464     cnt_pm = cnt_matched + cnt_added;
1465     cnt_min = min(cnt_gb, cnt_pm);
1466 }
1467 
1468 
get_lastnames(const CAuth_list & authors,list<string> & lastnames,string & auth_string)1469 void CAuthListValidator::get_lastnames(const CAuth_list& authors, list<string>& lastnames, string& auth_string)
1470 {
1471     lastnames.clear();
1472     switch (authors.GetNames().Which()) {
1473     case CAuth_list::C_Names::e_Std:
1474         get_lastnames(authors.GetNames().GetStd(), lastnames);
1475         break;
1476     case CAuth_list::C_Names::e_Ml:
1477         {{
1478             CRef< CAuth_list > authlist_std;
1479             authlist_std->Assign(authors);
1480             authlist_std->ConvertMlToStandard();
1481             get_lastnames(authlist_std->GetNames().GetStd(), lastnames);
1482         }}
1483         break;
1484     case CAuth_list::C_Names::e_Str:
1485         get_lastnames(authors.GetNames().GetStr(), lastnames);
1486         break;
1487     default:
1488         throw logic_error("Unexpected CAuth_list::C_Name choice: " + CAuth_list::C_Names::SelectionName(authors.GetNames().Which()));
1489     }
1490     auth_string = NStr::Join(lastnames, "; ");
1491 }
1492 
get_lastnames(const CAuth_list::C_Names::TStd & authors,list<string> & lastnames)1493 void CAuthListValidator::get_lastnames(const CAuth_list::C_Names::TStd& authors, list<string>& lastnames)
1494 {
1495     for (auto& name : authors) {
1496         if (name->IsSetName() && name->GetName().IsName() && name->GetName().GetName().IsSetLast()) {
1497             string lname(name->GetName().GetName().GetLast());
1498             lastnames.push_back(NStr::ToLower(lname));
1499         }
1500     }
1501 }
1502 
get_lastnames(const CAuth_list::C_Names::TStr & authors,list<string> & lastnames)1503 void CAuthListValidator::get_lastnames(const CAuth_list::C_Names::TStr& authors, list<string>& lastnames)
1504 {
1505     const char* alpha = "abcdefghijklmnopqrstuvwxyz";
1506     for (auto auth : authors) {
1507         size_t eow = NStr::ToLower(auth).find_first_not_of(alpha);
1508         lastnames.push_back(auth.substr(0, eow));
1509     }
1510 }
1511 
1512 END_SCOPE(edit)
1513 END_SCOPE(objects)
1514 END_NCBI_SCOPE
1515