1 /* $Id: pub_fix.cpp 632623 2021-06-03 17:38:11Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Alexey Dobronadezhdin
27 *
28 * File Description:
29 * Code for fixing up publications.
30 * MedArch lookup and post-processing utilities.
31 * Based on medutil.c written by James Ostell.
32 */
33
34 #include <ncbi_pch.hpp>
35
36 #include <objects/biblio/ArticleId.hpp>
37 #include <objects/biblio/ArticleIdSet.hpp>
38 #include <objects/biblio/Author.hpp>
39 #include <objects/biblio/Cit_art.hpp>
40 #include <objects/biblio/Cit_book.hpp>
41 #include <objects/biblio/Cit_proc.hpp>
42 #include <objects/biblio/Cit_jour.hpp>
43 #include <objects/biblio/Imprint.hpp>
44 #include <objects/biblio/Title.hpp>
45 #include <objects/general/Name_std.hpp>
46 #include <objects/general/Person_id.hpp>
47 #include <objects/general/Date.hpp>
48 #include <objects/general/Date_std.hpp>
49 #include <objects/general/Dbtag.hpp>
50 #include <objects/medline/Medline_entry.hpp>
51
52 #include <objects/pub/Pub.hpp>
53
54 #include <objtools/edit/pub_fix.hpp>
55
56 #include "pub_fix_aux.hpp"
57
58 #include <objects/mla/Title_msg.hpp>
59 #include <objects/mla/Title_msg_list.hpp>
60 #include <objects/mla/mla_client.hpp>
61
62 #include <corelib/ncbi_message.hpp>
63 #include <objtools/eutils/api/esearch.hpp>
64 #include <objtools/eutils/esearch/IdList.hpp>
65 #include <objtools/eutils/api/esummary.hpp>
66
67
68 BEGIN_NCBI_SCOPE
69 BEGIN_SCOPE(objects)
70 BEGIN_SCOPE(edit)
71
// Posts a message to a listener, if one is supplied.
//   listener                - pointer-like object; nothing is posted when it tests false
//   severity, code, subcode - passed on to the CMessage_Basic constructor
//   message                 - anything that may follow "stream <<", e.g. "text " << value
// The listener argument is parenthesized where it is dereferenced so that
// passing an expression (not only a plain identifier) expands correctly.
#define ERR_POST_TO_LISTENER(listener, severity, code, subcode, message) \
    do { \
        if (listener) { \
            ostringstream ostr; \
            ostr << message; \
            string text = ostr.str(); \
            CMessage_Basic msg(text, severity, code, subcode); \
            (listener)->PostMessage(msg); \
        } \
    } while (false)
82
83 namespace fix_pub
84 {
85 struct SErrorSubcodes
86 {
87 string m_error_str;
88 map<int, string> m_sub_errors;
89 };
90
91 static map<int, SErrorSubcodes> ERROR_CODE_STR =
92 {
93 // I'm using it in blob_maint application. The string REFERENCE is not informative, changing to FixPub.
94 { err_Reference,{ "FixPub",
95 {
96 { err_Reference_MuidNotFound, "MuidNotFound" },
97 { err_Reference_SuccessfulMuidLookup, "SuccessfulMuidLookup" },
98 { err_Reference_OldInPress, "OldInPress" },
99 { err_Reference_No_reference, "No_reference" },
100 { err_Reference_Multiple_ref, "Multiple_ref" },
101 { err_Reference_Multiple_muid, "Multiple_muid" },
102 { err_Reference_MedlineMatchIgnored, "MedlineMatchIgnored" },
103 { err_Reference_MuidMissmatch, "MuidMissmatch" },
104 { err_Reference_NoConsortAuthors, "NoConsortAuthors" },
105 { err_Reference_DiffConsortAuthors, "DiffConsortAuthors" },
106 { err_Reference_PmidMissmatch, "PmidMissmatch" },
107 { err_Reference_Multiple_pmid, "Multiple_pmid" },
108 { err_Reference_FailedToGetPub, "FailedToGetPub" },
109 { err_Reference_MedArchMatchIgnored, "MedArchMatchIgnored" },
110 { err_Reference_SuccessfulPmidLookup, "SuccessfulPmidLookup" },
111 { err_Reference_PmidNotFound, "PmidNotFound" },
112 { err_Reference_NoPmidJournalNotInPubMed, "NoPmidJournalNotInPubMed" },
113 { err_Reference_PmidNotFoundInPress, "PmidNotFoundInPress" },
114 { err_Reference_NoPmidJournalNotInPubMedInPress, "NoPmidJournalNotInPubMedInPress" }
115 }
116 } },
117 { err_Print,{ "PRINT",
118 {
119 { err_Print_Failed, "Failed" }
120 }
121 } },
122 { err_AuthList,{ "AuthList",
123 {
124 { err_AuthList_SignificantDrop, "SignificantDrop" },
125 { err_AuthList_PreserveGB, "PreserveGB" },
126 { err_AuthList_LowMatch, "LowMatch" }
127 }
128 } }
129 };
130 }
131
GetErrorId(int err_code,int err_sub_code)132 string CPubFix::GetErrorId(int err_code, int err_sub_code)
133 {
134 string ret;
135
136 const auto& err_category = fix_pub::ERROR_CODE_STR.find(err_code);
137 if (err_category != fix_pub::ERROR_CODE_STR.end()) {
138
139 const auto& error_sub_code_str = err_category->second.m_sub_errors.find(err_sub_code);
140 if (error_sub_code_str != err_category->second.m_sub_errors.end()) {
141 ret = err_category->second.m_error_str;
142 ret += '.';
143 ret += error_sub_code_str->second;
144 }
145 }
146
147 return ret;
148 }
149
150
151 namespace fix_pub
152 {
153 // MedlineToISO(tmp)
154 // converts a MEDLINE citation to ISO/GenBank style
155
MedlineToISO(CCit_art & cit_art)156 void MedlineToISO(CCit_art& cit_art)
157 {
158 if (cit_art.IsSetAuthors()) {
159
160 CAuth_list& auths = cit_art.SetAuthors();
161 if (auths.IsSetNames()) {
162 if (auths.GetNames().IsMl()) {
163 auths.ConvertMlToStandard();
164 }
165 else if (auths.GetNames().IsStd()) {
166 for (auto& auth : auths.SetNames().SetStd()) {
167 if (auth->IsSetName() && auth->GetName().IsMl()) {
168 auth = CAuthor::ConvertMlToStandard(*auth);
169 }
170 }
171 }
172 }
173 }
174
175 if (!cit_art.IsSetFrom() || !cit_art.GetFrom().IsJournal())
176 return;
177
178 // from a journal - get iso_jta
179 CCit_jour& journal = cit_art.SetFrom().SetJournal();
180
181 auto IsIso_jta = [](const CRef<CTitle::C_E>& title) -> bool { return title->IsIso_jta(); };
182
183 if (journal.IsSetTitle() && journal.GetTitle().IsSet()) {
184
185 auto& titles = journal.SetTitle().Set();
186
187 if (find_if(titles.begin(), titles.end(), IsIso_jta) == titles.end()) {
188 // no iso_jta
189
190 CTitle::C_E& first_title = *titles.front();
191 const string& title_str = journal.SetTitle().GetTitle(first_title);
192
193 CRef<CTitle> title_new(new CTitle);
194 CRef<CTitle::C_E> type_new(new CTitle::C_E);
195 type_new->SetIso_jta(title_str);
196 title_new->Set().push_back(type_new);
197
198 CRef<CTitle_msg> msg_new(new CTitle_msg);
199 msg_new->SetType(eTitle_type_iso_jta);
200 msg_new->SetTitle(*title_new);
201
202 CRef<CTitle_msg_list> msg_list_new;
203 try {
204 CMLAClient mla;
205 msg_list_new = mla.AskGettitle(*msg_new);
206 }
207 catch (exception &) {
208 // msg_list_new stays empty
209 }
210
211 if (msg_list_new.NotEmpty() && msg_list_new->IsSetTitles()) {
212
213 bool gotit = false;
214 for (auto& item : msg_list_new->GetTitles()) {
215 const CTitle &cur_title = item->GetTitle();
216
217 if (cur_title.IsSet()) {
218
219 auto iso_jta_title = find_if(cur_title.Get().begin(), cur_title.Get().end(), IsIso_jta);
220 if (iso_jta_title != cur_title.Get().end()) {
221 gotit = true;
222 first_title.SetIso_jta((*iso_jta_title)->GetIso_jta());
223 break;
224 }
225 }
226
227 if (gotit)
228 break;
229 }
230 }
231 }
232 }
233
234 if (journal.IsSetImp()) {
235 // remove Eng language
236 if (journal.GetImp().IsSetLanguage() && journal.GetImp().GetLanguage() == "Eng")
237 journal.SetImp().ResetLanguage();
238 }
239 }
240
241 // SplitMedlineEntry(mep)
242 // splits a medline entry into 2 pubs (1 muid, 1 Cit-art)
243 // converts Cit-art to ISO/GenBank style
244 // deletes original medline entry
SplitMedlineEntry(CPub_equiv::Tdata & medlines)245 void SplitMedlineEntry(CPub_equiv::Tdata& medlines)
246 {
247 if (medlines.size() != 1) {
248 return;
249 }
250
251 CPub& pub = *medlines.front();
252 CMedline_entry& medline = pub.SetMedline();
253 if (!medline.IsSetCit() && medline.IsSetPmid() && medline.GetPmid() < ZERO_ENTREZ_ID) {
254 return;
255 }
256
257 CRef<CPub> pmid;
258 if (medline.GetPmid() > ZERO_ENTREZ_ID) {
259 pmid.Reset(new CPub);
260 pmid->SetPmid(medline.GetPmid());
261 }
262
263 CRef<CPub> cit_art;
264 if (medline.IsSetCit()) {
265 cit_art.Reset(new CPub);
266 cit_art->SetArticle(medline.SetCit());
267 MedlineToISO(cit_art->SetArticle());
268 }
269
270 medlines.clear();
271
272 if (pmid.NotEmpty())
273 medlines.push_back(pmid);
274
275 if (cit_art.NotEmpty())
276 medlines.push_back(cit_art);
277 }
278
279
IsInpress(const CCit_art & cit_art)280 bool IsInpress(const CCit_art& cit_art)
281 {
282 if (!cit_art.IsSetFrom())
283 return false;
284
285 bool ret = false;
286 if (cit_art.GetFrom().IsJournal()) {
287 const CCit_jour& journal = cit_art.GetFrom().GetJournal();
288 ret = journal.IsSetImp() && journal.GetImp().IsSetPrepub() && journal.GetImp().GetPrepub() == CImprint::ePrepub_in_press;
289 }
290 else if (cit_art.GetFrom().IsBook()) {
291 const CCit_book& book = cit_art.GetFrom().GetBook();
292 ret = book.IsSetImp() && book.GetImp().IsSetPrepub() && book.GetImp().GetPrepub() == CImprint::ePrepub_in_press;
293 }
294 else if (cit_art.GetFrom().IsProc() && cit_art.GetFrom().GetProc().IsSetBook()) {
295 const CCit_book& book = cit_art.GetFrom().GetProc().GetBook();
296 ret = book.IsSetImp() && book.GetImp().IsSetPrepub() && book.GetImp().GetPrepub() == CImprint::ePrepub_in_press;
297 }
298 return ret;
299 }
300
301
// Tells whether the string has the shape of an ISSN.
// ISSN: nnnn-nnnn or nnnn-nnnX, where n -> '0'-'9', i.e. 0123-5566
// Exactly 9 characters are required: a dash in the middle, an upper-case 'X'
// allowed only as the last character, decimal digits everywhere else.
bool MULooksLikeISSN(const string& str)
{
    static const size_t ISSN_SIZE = 9;
    static const size_t ISSN_DASH_POS = 4;
    static const size_t ISSN_X_POS = 8;

    // The size test also rejects empty strings; a string of blanks fails the dash test
    if (str.size() != ISSN_SIZE || str[ISSN_DASH_POS] != '-') {
        return false;
    }

    for (size_t i = 0; i < ISSN_SIZE; ++i) {
        if (i == ISSN_DASH_POS) {
            continue; // the dash has been checked above
        }

        const char ch = str[i];

        // isdigit() is undefined for negative values, so the character is
        // converted to unsigned char first
        if (isdigit(static_cast<unsigned char>(ch))) {
            continue;
        }
        if (ch == 'X' && i == ISSN_X_POS) {
            continue;
        }
        return false;
    }

    return true;
}
323
324 /*
325 bool MUIsJournalIndexed(const string& journal)
326 {
327 if (journal.empty()) {
328 return false;
329 }
330
331 string title(journal);
332 NStr::ReplaceInPlace(title, "(", " ");
333 NStr::ReplaceInPlace(title, ")", " ");
334 NStr::ReplaceInPlace(title, ".", " ");
335
336 title = NStr::Sanitize(title);
337
338 CEutilsClient eutils;
339
340 static const int MAX_ITEMS = 200;
341 eutils.SetMaxReturn(MAX_ITEMS);
342
343 vector<string> ids;
344
345 static const string EUTILS_DATABASE("nlmcatalog");
346
347 try {
348 if (MULooksLikeISSN(title)) {
349 eutils.Search(EUTILS_DATABASE, title + "[issn]", ids);
350 }
351
352 if (ids.empty()) {
353 eutils.Search(EUTILS_DATABASE, title + "[multi] AND ncbijournals[sb]", ids);
354 }
355
356 if (ids.empty()) {
357 eutils.Search(EUTILS_DATABASE, title + "[jo]", ids);
358 }
359 }
360 catch (CException&) {
361 return false;
362 }
363
364 if (ids.size() != 1) {
365 return false;
366 }
367
368
369 // getting the indexing status of the journal found
370 static const string SUMMARY_VERSION("2.0");
371 xml::document doc;
372 eutils.Summary(EUTILS_DATABASE, ids, doc, SUMMARY_VERSION);
373
374 const xml::node& root_node = doc.get_root_node();
375 xml::node_set nodes(root_node.run_xpath_query("//DocumentSummarySet/DocumentSummary/CurrentIndexingStatus/text()"));
376
377 string status;
378 if (nodes.size() == 1) {
379 status = nodes.begin()->get_content();
380 }
381
382 return status == "Y";
383 }
384 */
385
s_GetESearchIds(CESearch_Request & req,const string & term,list<string> & ids)386 static void s_GetESearchIds(CESearch_Request& req,
387 const string& term,
388 list<string>& ids) {
389 // error handling is modeled on that of CEUtilsClient::x_Search()
390 req.SetArgument("term", term);
391 for (int retry=0; retry<10; ++retry) {
392 try {
393 auto& istr = dynamic_cast<CConn_HttpStream&>(req.GetStream());
394 auto pRes = Ref(new esearch::CESearchResult());
395 istr >> MSerial_Xml >> *pRes;
396
397 if (istr.GetStatusCode() == 200) {
398 if (pRes->IsSetData()) {
399 if (pRes->GetData().IsInfo() &&
400 pRes->GetData().GetInfo().IsSetContent() &&
401 pRes->GetData().GetInfo().GetContent().IsSetIdList()) {
402
403 const auto& idList = pRes->GetData().GetInfo().GetContent().GetIdList();
404 if (idList.IsSetId()) {
405 ids = idList.GetId();
406 }
407 req.Disconnect();
408 return;
409 }
410 else
411 if (pRes->GetData().IsERROR()) {
412 NCBI_THROW(CException, eUnknown,
413 pRes->GetData().GetERROR());
414 }
415 } // pRest->IsSetData()
416 } // istr.GetStatusCode() == 200
417 }
418 catch(CException& e) {
419 ERR_POST(Warning << "failed on attempt " << retry + 1
420 << ": " << e);
421 }
422 req.Disconnect();
423
424 int sleepSeconds = sqrt(retry);
425 if (sleepSeconds) {
426 SleepSec(sleepSeconds);
427 }
428 } // retry
429
430 NCBI_THROW(CException, eUnknown,
431 "failed to execute query: " + term);
432 }
433
434
s_IsIndexed(CRef<CEUtils_ConnContext> pContext,const string & id)435 static bool s_IsIndexed(CRef<CEUtils_ConnContext> pContext,
436 const string& id) {
437
438 // error handling is modeled on that of CEUtilsClient::x_Summary()
439 CESummary_Request request("nlmcatalog", pContext);
440 request.GetId().AddId(id);
441 request.SetArgument("version", "2.0");
442 string xmlOutput;
443 bool success=false;
444 for (int retry=0; retry<10; ++retry) {
445 try {
446 auto& istr = dynamic_cast<CConn_HttpStream&>(request.GetStream());
447 NcbiStreamToString(&xmlOutput, istr);
448 if (istr.GetStatusCode() == 200) {
449 success = true;
450 break;
451 }
452 }
453 catch (...) {
454 }
455 request.Disconnect();
456
457 int sleepSeconds = sqrt(retry);
458 if (sleepSeconds) {
459 SleepSec(sleepSeconds);
460 }
461 }
462
463 if (!success) {
464 NCBI_THROW(CException, eUnknown,
465 "failed to execute esummary request: " + request.GetQueryString());
466 }
467
468 static const string indexingElement { "<CurrentIndexingStatus>Y</CurrentIndexingStatus>" };
469 auto firstPos = NStr::Find(xmlOutput, indexingElement, NStr::eNocase);
470 if (firstPos == NPOS) {
471 return false;
472 }
473 auto lastPos = NStr::Find(xmlOutput, indexingElement, NStr::eNocase, NStr::eReverseSearch);
474
475 return firstPos == lastPos;
476 }
477
478
479
MUIsJournalIndexed(const string & journal)480 bool MUIsJournalIndexed(const string& journal)
481 {
482 if (journal.empty()) {
483 return false;
484 }
485
486 string title(journal);
487 NStr::ReplaceInPlace(title, "(", " ");
488 NStr::ReplaceInPlace(title, ")", " ");
489 NStr::ReplaceInPlace(title, ".", " ");
490
491 title = NStr::Sanitize(title);
492
493 list<string> ids;
494 auto pContext = Ref(new CEUtils_ConnContext());
495 CESearch_Request req("nlmcatalog", pContext);
496 req.SetRetMax(2);
497 req.SetUseHistory(false);
498 try {
499 if (MULooksLikeISSN(title)) {
500 s_GetESearchIds(req, title + "[issn]", ids);
501 }
502
503 if (ids.empty()) {
504 s_GetESearchIds(req, title + "[multi] AND ncbijournals[sb]", ids);
505 }
506
507 if (ids.empty()) {
508 s_GetESearchIds(req, title + "[jo]", ids);
509 }
510 }
511 catch (CException&) {
512 return false;
513 }
514
515 if (ids.size() != 1) {
516 return false;
517 }
518
519 return s_IsIndexed(pContext, ids.front());
520 }
521
522
523
PrintPub(const CCit_art & cit_art,bool found,bool auth,long muid,IMessageListener * err_log)524 void PrintPub(const CCit_art& cit_art, bool found, bool auth, long muid, IMessageListener* err_log)
525 {
526 string first_name,
527 last_name;
528
529 if (cit_art.IsSetAuthors() && cit_art.GetAuthors().IsSetNames()) {
530
531 if (cit_art.GetAuthors().GetNames().IsStd()) {
532
533 const CAuthor& first_author = *cit_art.GetAuthors().GetNames().GetStd().front();
534
535 if (first_author.IsSetName()) {
536 if (first_author.GetName().IsName()) {
537 const CName_std& namestd = first_author.GetName().GetName();
538 if (namestd.IsSetLast()) {
539 last_name = namestd.GetLast();
540 }
541 if (namestd.IsSetInitials()) {
542 first_name = namestd.GetInitials();
543 }
544 }
545 else if (first_author.GetName().IsConsortium()) {
546 last_name = first_author.GetName().GetConsortium();
547 }
548 }
549 }
550 else {
551 last_name = cit_art.GetAuthors().GetNames().GetStr().front();
552 }
553 }
554 else {
555 ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Print, err_Print_Failed, "Authors NULL");
556 }
557
558 const CImprint* imprint = nullptr;
559 const CTitle* title = nullptr;
560
561 if (cit_art.IsSetFrom()) {
562 if (cit_art.GetFrom().IsJournal()) {
563 const CCit_jour& journal = cit_art.GetFrom().GetJournal();
564
565 if (journal.IsSetTitle()) {
566 title = &journal.GetTitle();
567 }
568
569 if (journal.IsSetImp()) {
570 imprint = &journal.GetImp();
571 }
572 }
573 else if (cit_art.GetFrom().IsBook()) {
574 const CCit_book& book = cit_art.GetFrom().GetBook();
575
576 if (book.IsSetTitle()) {
577 title = &book.GetTitle();
578 }
579
580 if (book.IsSetImp()) {
581 imprint = &book.GetImp();
582 }
583 }
584 }
585
586 static const string UNKNOWN_JOURNAL("journal unknown");
587 string title_str(UNKNOWN_JOURNAL);
588
589 if (title && title->IsSet() && !title->Get().empty()) {
590
591 const CTitle::C_E& first_title = *title->Get().front();
592 const string& str = title->GetTitle(first_title);
593
594 if (!str.empty())
595 title_str = str;
596 }
597
598
599 static const string NO_PAGE("no page number");
600 static const string NO_VOL("no volume number");
601
602 string vol(NO_VOL),
603 page(NO_PAGE);
604
605 int year = 0;
606 bool in_press = false;
607
608 if (imprint) {
609
610 if (imprint->IsSetVolume()) {
611 vol = imprint->GetVolume();
612 }
613
614 if (imprint->IsSetPages()) {
615 page = imprint->GetPages();
616 }
617
618 if (imprint->IsSetDate() && imprint->GetDate().IsStd() && imprint->GetDate().GetStd().IsSetYear()) {
619 year = imprint->GetDate().GetStd().GetYear();
620 }
621
622 in_press = imprint->IsSetPrepub() && imprint->GetPrepub() == CImprint::ePrepub_in_press;
623 }
624
625 if (auth) {
626 ERR_POST_TO_LISTENER(err_log, eDiag_Error, err_Reference, err_Reference_MedArchMatchIgnored,
627 "Too many author name differences: " << muid << "|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
628 return;
629 }
630
631 if (in_press) {
632
633 int cur_year = CDate_std(CTime(CTime::eCurrent)).GetYear();
634 static const int YEAR_MAX_DIFF = 2;
635
636 if (year && cur_year - year > YEAR_MAX_DIFF) {
637 ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, err_Reference_OldInPress,
638 "encountered in-press article more than 2 years old: " << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
639 }
640 }
641
642 if (found) {
643 ERR_POST_TO_LISTENER(err_log, eDiag_Info, err_Reference, err_Reference_SuccessfulPmidLookup,
644 muid << "|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
645 }
646 else if (MUIsJournalIndexed(title_str)) {
647 if (muid) {
648 ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, in_press ? err_Reference_PmidNotFoundInPress : err_Reference_PmidNotFound,
649 ">>" << muid << "<<|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
650 }
651 else {
652 ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, in_press ? err_Reference_PmidNotFoundInPress : err_Reference_PmidNotFound,
653 last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
654 }
655 }
656 else {
657 if (muid) {
658 ERR_POST_TO_LISTENER(err_log, eDiag_Info, err_Reference, in_press ? err_Reference_NoPmidJournalNotInPubMedInPress : err_Reference_NoPmidJournalNotInPubMed,
659 ">>" << muid << "<<|" << last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
660 }
661 else {
662 ERR_POST_TO_LISTENER(err_log, eDiag_Info, err_Reference, in_press ? err_Reference_NoPmidJournalNotInPubMedInPress : err_Reference_NoPmidJournalNotInPubMed,
663 last_name << " " << first_name << "|" << title_str << "|(" << year << ")|" << vol << "|" << page);
664 }
665 }
666 }
667
668
IsFromBook(const CCit_art & art)669 bool IsFromBook(const CCit_art& art)
670 {
671 return art.IsSetFrom() && art.GetFrom().IsBook();
672 }
673
674
675 static const size_t MAX_MATCH_COEFF = 3;
676
TenAuthorsCompare(CCit_art & cit_old,CCit_art & cit_new)677 bool TenAuthorsCompare(CCit_art& cit_old, CCit_art& cit_new)
678 {
679 _ASSERT(cit_old.IsSetAuthors() && cit_new.IsSetAuthors() &&
680 cit_old.GetAuthors().IsSetNames() && cit_new.GetAuthors().IsSetNames() && "Both arguments should have valid author's names at this point");
681
682 const CAuth_list::C_Names& old_names = cit_old.GetAuthors().GetNames();
683 const CAuth_list::C_Names& new_names = cit_new.GetAuthors().GetNames();
684
685 auto StrNotEmpty = [](const string& str) -> bool { return !str.empty(); };
686 size_t new_num_of_authors = count_if(new_names.GetStr().begin(), new_names.GetStr().end(), StrNotEmpty),
687 num_of_authors = count_if(old_names.GetStr().begin(), old_names.GetStr().end(), StrNotEmpty);
688
689 size_t match = 0;
690 for (auto& name : old_names.GetStr()) {
691
692 if (!name.empty()) {
693 if (NStr::FindNoCase(new_names.GetStr(), name) != nullptr) {
694 ++match;
695 }
696 }
697 }
698
699 size_t min_num_of_authors = min(num_of_authors, new_num_of_authors);
700
701 if (min_num_of_authors > MAX_MATCH_COEFF * match) {
702 return false;
703 }
704
705 static const size_t MAX_AUTHORS = 10;
706 if (min_num_of_authors > MAX_AUTHORS) {
707 cit_new.SetAuthors(cit_old.SetAuthors());
708 cit_old.ResetAuthors();
709 }
710
711 return true;
712 }
713
ExtractConsortiums(const CAuth_list::C_Names::TStd & names,CAuth_list::C_Names::TStr & extracted)714 size_t ExtractConsortiums(const CAuth_list::C_Names::TStd& names, CAuth_list::C_Names::TStr& extracted)
715 {
716 size_t num_of_names = 0;
717
718 for (auto& name: names)
719 {
720 const CAuthor& auth = *name;
721 if (auth.IsSetName() && auth.GetName().IsName()) {
722 ++num_of_names;
723 }
724 else if (auth.IsSetName() && auth.GetName().IsConsortium()) {
725
726 const string& cur_consortium = auth.GetName().GetConsortium();
727 extracted.push_back(cur_consortium);
728 }
729 }
730
731 extracted.sort([](const string& a, const string& b) { return NStr::CompareNocase(a, b) == -1; });
732
733 return num_of_names;
734 }
735
736
GetFirstTenNames(const CAuth_list::C_Names::TStd & names,list<CTempString> & res)737 void GetFirstTenNames(const CAuth_list::C_Names::TStd& names, list<CTempString>& res)
738 {
739 static const size_t MAX_EXTRACTED = 10;
740 size_t extracted = 0;
741
742 for (auto& name : names) {
743 if (name->IsSetName() && name->GetName().IsName() && name->GetName().GetName().IsSetLast()) {
744 res.push_back(name->GetName().GetName().GetLast());
745 ++extracted;
746
747 if (extracted == MAX_EXTRACTED) {
748 break;
749 }
750 }
751 }
752 }
753
754
TenAuthorsProcess(CCit_art & cit,CCit_art & new_cit,IMessageListener * err_log)755 bool TenAuthorsProcess(CCit_art& cit, CCit_art& new_cit, IMessageListener* err_log)
756 {
757 if (!new_cit.IsSetAuthors() || !new_cit.GetAuthors().IsSetNames()) {
758 if (cit.IsSetAuthors()) {
759 new_cit.SetAuthors(cit.SetAuthors());
760 cit.ResetAuthors();
761 }
762 return true;
763 }
764
765 if (!cit.IsSetAuthors() || !cit.GetAuthors().IsSetNames() ||
766 cit.GetAuthors().GetNames().Which() != new_cit.GetAuthors().GetNames().Which()) {
767 return true;
768 }
769
770 if (!cit.GetAuthors().GetNames().IsStd()) {
771 return TenAuthorsCompare(cit, new_cit);
772 }
773
774 CAuth_list::C_Names::TStr old_consortiums;
775 size_t num_names = ExtractConsortiums(cit.GetAuthors().GetNames().GetStd(), old_consortiums);
776
777 CAuth_list::C_Names::TStr new_consortiums;
778 size_t new_num_names = ExtractConsortiums(new_cit.GetAuthors().GetNames().GetStd(), new_consortiums);
779
780 if (!old_consortiums.empty()) {
781
782 string old_cons_list = NStr::Join(old_consortiums, ";");
783 if (new_consortiums.empty()) {
784
785 ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, err_Reference_NoConsortAuthors,
786 "Publication as returned by MedArch lacks consortium authors of the original publication : \"" << old_cons_list << "\".");
787
788 for_each(old_consortiums.begin(), old_consortiums.end(),
789 [&new_cit](const string& consortium) {
790
791 CRef<CAuthor> auth(new CAuthor);
792 auth->SetName().SetConsortium(consortium);
793
794 new_cit.SetAuthors().SetNames().SetStd().push_front(auth);
795 });
796 }
797 else {
798
799 string new_cons_list = NStr::Join(new_consortiums, ";");
800 if (!NStr::EqualNocase(old_cons_list, new_cons_list)) {
801 ERR_POST_TO_LISTENER(err_log, eDiag_Warning, err_Reference, err_Reference_DiffConsortAuthors,
802 "Consortium author names differ. Original is \"" << old_cons_list << "\". MedArch's is \"" << new_cons_list << "\".");
803 }
804 }
805
806 if (num_names == 0) {
807 return true;
808 }
809 }
810
811 list<CTempString> new_author_names;
812 GetFirstTenNames(new_cit.GetAuthors().GetNames().GetStd(), new_author_names);
813 size_t match = 0;
814
815 for (auto& name: cit.GetAuthors().GetNames().GetStd())
816 {
817 const CAuthor& auth = *name;
818 if (auth.IsSetName() && auth.GetName().IsName() && auth.GetName().GetName().IsSetLast()) {
819
820 const string& last_name = auth.GetName().GetName().GetLast();
821 if (find_if(new_author_names.begin(), new_author_names.end(),
822 [&last_name](const CTempString& cur_name)
823 {
824 return NStr::EqualNocase(last_name, cur_name);
825 }) != new_author_names.end()) {
826
827 ++match;
828 }
829 }
830 }
831
832 size_t min_num_names = min(num_names, new_author_names.size());
833 if (min_num_names > MAX_MATCH_COEFF * match) {
834 return false;
835 }
836
837 bool replace_authors = new_num_names == 0;
838 if (!replace_authors && new_num_names < num_names) {
839 // Check the last author from PubMed. If it is "et al" - leave the old authors list
840 const CAuthor& last_author = *new_cit.GetAuthors().GetNames().GetStd().back();
841 if (last_author.IsSetName() && last_author.GetName().IsName()) {
842
843 const CName_std& name = last_author.GetName().GetName();
844 string last_name = name.IsSetLast() ? name.GetLast() : "",
845 initials = name.IsSetInitials() ? name.GetInitials() : "";
846
847 replace_authors = NStr::EqualNocase(last_name, "et") &&
848 NStr::EqualNocase(initials, "al");
849 }
850
851 // If the last author does not contain "et al", look at the amount of authors
852 // This is done according to the next document:
853 // ~cavanaug/WORK/MedArch/doc.medarch.4genbank.txt
854 //
855 // If the MedArchCitArt has zero Name-std Author.name ...
856 //
857 // Or if the InputCitArt has more than 10 Name - std Author.name while
858 // the MedArchCitArt has less than 12 ...
859 //
860 // Or if the InputCitArt has more than 25 Name - std Author.name while
861 // the MedArchCitArt has less than 27 ...
862 //
863 // Then free the Auth - list of the MedArchCitArt and replace it with
864 // the Auth - list of the InputCitArt, and **null out** the Auth - list
865 // of the MedArchCitArt .
866 if (!replace_authors)
867 {
868 static const int MIN_FIRST_AUTHORS_THRESHOLD_1995 = 10;
869 static const int MAX_FIRST_AUTHORS_THRESHOLD_1995 = 12;
870
871 static const int MIN_SECOND_AUTHORS_THRESHOLD_1999 = 25;
872 static const int MAX_SECOND_AUTHORS_THRESHOLD_1999 = 27;
873
874 replace_authors = (new_num_names < MAX_FIRST_AUTHORS_THRESHOLD_1995 && num_names > MIN_FIRST_AUTHORS_THRESHOLD_1995) ||
875 (new_num_names < MAX_SECOND_AUTHORS_THRESHOLD_1999 && num_names > MIN_SECOND_AUTHORS_THRESHOLD_1999);
876 }
877 }
878
879 if (replace_authors) {
880 new_cit.SetAuthors(cit.SetAuthors());
881 cit.ResetAuthors();
882 }
883
884 return true;
885 }
886
887
MergeNonPubmedPubIds(const CCit_art & cit_old,CCit_art & cit_new)888 void MergeNonPubmedPubIds(const CCit_art& cit_old, CCit_art& cit_new)
889 {
890 if (!cit_old.IsSetIds()) {
891 return;
892 }
893
894 const CArticleIdSet& old_ids = cit_old.GetIds();
895
896 for (auto& cur_id: old_ids.Get()) {
897
898 if (!cur_id->IsDoi() && !cur_id->IsOther()) {
899 continue;
900 }
901
902 bool found = false;
903 if (cit_new.IsSetIds()) {
904
905 auto& new_ids = cit_new.GetIds().Get();
906 found = find_if(new_ids.begin(), new_ids.end(),
907 [&cur_id](const CRef<CArticleId>& new_id)
908 {
909 if (cur_id->Which() != new_id->Which()) {
910 return false;
911 }
912
913 if (new_id->IsDoi()) {
914 return true;
915 }
916
917 bool res = cur_id->GetOther().IsSetDb() == new_id->GetOther().IsSetDb();
918 if (res && cur_id->GetOther().IsSetDb()) {
919 res = cur_id->GetOther().GetDb() == new_id->GetOther().GetDb();
920 }
921 return res;
922 }) != new_ids.end();
923 }
924
925 if (!found) {
926 cit_new.SetIds().Set().push_front(cur_id);
927 }
928 }
929 }
930
931
NeedToPropagateInJournal(const CCit_art & cit_art)932 bool NeedToPropagateInJournal(const CCit_art& cit_art)
933 {
934 if (!cit_art.IsSetFrom() || !cit_art.GetFrom().IsJournal() ||
935 !cit_art.GetFrom().GetJournal().IsSetTitle() || !cit_art.GetFrom().GetJournal().GetTitle().IsSet() ||
936 cit_art.GetFrom().GetJournal().GetTitle().Get().empty()) {
937 return true;
938 }
939
940 const CCit_jour& journal = cit_art.GetFrom().GetJournal();
941 if (!journal.IsSetImp()) {
942 return true;
943 }
944
945 if (!journal.GetImp().IsSetVolume() || !journal.GetImp().IsSetPages() || !journal.GetImp().IsSetDate()) {
946 return true;
947 }
948
949 return false;
950 }
951
952
PropagateInPress(bool inpress,CCit_art & cit_art)953 void PropagateInPress(bool inpress, CCit_art& cit_art)
954 {
955 if (!inpress)
956 return;
957
958 if (!cit_art.IsSetFrom() || !NeedToPropagateInJournal(cit_art)) {
959 return;
960 }
961
962 CImprint* imprint = nullptr;
963
964 switch (cit_art.GetFrom().Which()) {
965
966 case CCit_art::C_From::e_Journal:
967 if (cit_art.GetFrom().GetJournal().IsSetImp()) {
968 imprint = &cit_art.SetFrom().SetJournal().SetImp();
969 }
970 break;
971
972 case CCit_art::C_From::e_Book:
973 if (cit_art.GetFrom().GetBook().IsSetImp()) {
974 imprint = &cit_art.SetFrom().SetBook().SetImp();
975 }
976 break;
977
978 case CCit_art::C_From::e_Proc:
979 if (cit_art.GetFrom().GetProc().IsSetBook() && cit_art.GetFrom().GetProc().GetBook().IsSetImp()) {
980 imprint = &cit_art.SetFrom().SetProc().SetBook().SetImp();
981 }
982 break;
983
984 default:; // do nothing
985 }
986
987 if (imprint) {
988 imprint->SetPrepub(CImprint::ePrepub_in_press);
989 }
990 }
991
992 }
993
994 using namespace fix_pub;
995
FixPubEquiv(CPub_equiv & pub_equiv)996 void CPubFix::FixPubEquiv(CPub_equiv& pub_equiv)
997 {
998 CPub_equiv::Tdata muids,
999 pmids,
1000 medlines,
1001 others,
1002 cit_arts;
1003
1004 if (pub_equiv.IsSet()) {
1005 for (auto& pub: pub_equiv.Set())
1006 {
1007 if (pub->IsMuid()) {
1008 muids.push_back(pub);
1009 }
1010 else if (pub->IsPmid()) {
1011 pmids.push_back(pub);
1012 }
1013 else if (pub->IsArticle()) {
1014 if (IsFromBook(pub->GetArticle())) {
1015 others.push_back(pub);
1016 }
1017 else {
1018 cit_arts.push_back(pub);
1019 }
1020 }
1021 else if (pub->IsMedline()) {
1022 medlines.push_back(pub);
1023 }
1024 else {
1025 others.push_back(pub);
1026 }
1027 }
1028 }
1029
1030 auto& pub_list = pub_equiv.Set();
1031 pub_list.clear();
1032
1033 if ((!muids.empty() || !pmids.empty()) && !m_always_lookup) {
1034 // pmid or muid is present
1035 pub_list.splice(pub_list.end(), cit_arts);
1036 pub_list.splice(pub_list.end(), muids);
1037 pub_list.splice(pub_list.end(), pmids);
1038 pub_list.splice(pub_list.end(), medlines);
1039 pub_list.splice(pub_list.end(), others);
1040 return;
1041 }
1042
1043 pub_list.splice(pub_list.end(), others);
1044
1045 if (!medlines.empty())
1046 {
1047 if (medlines.size() > 1) {
1048 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_ref, "More than one Medline entry in Pub-equiv");
1049 medlines.resize(1);
1050 }
1051
1052 SplitMedlineEntry(medlines);
1053 pub_list.splice(pub_list.end(), medlines);
1054 }
1055
1056 TEntrezId oldpmid = ZERO_ENTREZ_ID;
1057 if (!pmids.empty()) {
1058
1059 oldpmid = pmids.front()->GetPmid();
1060
1061 // check if more than one
1062 for (auto& pub: pmids) {
1063 if (pub->GetPmid() != oldpmid) {
1064 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_pmid,
1065 "Two different pmids in Pub-equiv [" << oldpmid << "] [" << pub->GetPmid() << "]");
1066 }
1067 }
1068 pmids.resize(1);
1069 }
1070
1071 TEntrezId oldmuid = ZERO_ENTREZ_ID;
1072 if (!muids.empty()) {
1073
1074 oldmuid = muids.front()->GetMuid();
1075
1076 // check if more than one
1077 for (auto& pub : muids) {
1078 if (pub->GetMuid() != oldmuid) {
1079 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_pmid,
1080 "Two different muids in Pub-equiv [" << oldmuid << "] [" << pub->GetMuid() << "]");
1081 }
1082 }
1083 muids.resize(1);
1084 }
1085
1086 if (!cit_arts.empty()) {
1087 if (cit_arts.size() > 1) {
1088 // ditch extras
1089 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_Multiple_ref, "More than one Cit-art in Pub-equiv");
1090 cit_arts.resize(1);
1091 }
1092
1093 CCit_art* cit_art = &cit_arts.front()->SetArticle();
1094 bool inpress = IsInpress(*cit_art);
1095
1096 CRef<CPub> new_pub(new CPub);
1097 new_pub->SetArticle(*cit_art);
1098
1099 TEntrezId pmid = ZERO_ENTREZ_ID;
1100 try {
1101 CMLAClient mla;
1102 pmid = ENTREZ_ID_FROM(int, mla.AskCitmatchpmid(*new_pub));
1103 }
1104 catch (exception &) {
1105 // pmid == 0
1106 }
1107
1108 if ( pmid != ZERO_ENTREZ_ID ) {
1109
1110 PrintPub(*cit_art, true, false, ENTREZ_ID_TO(long, pmid), m_err_log);
1111
1112 if (oldpmid > ZERO_ENTREZ_ID && oldpmid != pmid) {
1113 // already had a pmid
1114 ERR_POST_TO_LISTENER(m_err_log, eDiag_Error, err_Reference, err_Reference_PmidMissmatch,
1115 "OldPMID=" << oldpmid << " doesn't match lookup (" << pmid << "). Keeping lookup.");
1116 }
1117
1118 bool set_pmid = true;
1119 if (m_replace_cit) {
1120
1121 CRef<CCit_art> new_cit_art = FetchPubPmId(pmid);
1122
1123 if (new_cit_art.NotEmpty()) {
1124
1125 bool new_cit_is_valid(false);
1126 if (CAuthListValidator::enabled) {
1127 CAuthListValidator::EOutcome outcome = m_authlist_validator.validate(*cit_art, *new_cit_art);
1128 switch (outcome) {
1129 case CAuthListValidator::eAccept_pubmed:
1130 new_cit_is_valid = true;
1131 break;
1132 case CAuthListValidator::eKeep_genbank:
1133 new_cit_art->SetAuthors(cit_art->SetAuthors());
1134 cit_art->ResetAuthors();
1135 new_cit_is_valid = true;
1136 break;
1137 case CAuthListValidator::eFailed_validation:
1138 new_cit_is_valid = false;
1139 break;
1140 default:
1141 throw logic_error("Invalid outcome returned by CAuthListValidator::validate(): " + std::to_string(outcome));
1142 }
1143 }
1144 else {
1145 new_cit_is_valid = TenAuthorsProcess(*cit_art, *new_cit_art, m_err_log);
1146 }
1147
1148 if (new_cit_is_valid) {
1149 if (pmids.empty()) {
1150 CRef<CPub> pmid_pub(new CPub);
1151 pmids.push_back(pmid_pub);
1152 }
1153
1154 pmids.front()->SetPmid().Set(pmid);
1155 pub_list.splice(pub_list.end(), pmids);
1156
1157 CRef<CPub> cit_pub(new CPub);
1158 cit_pub->SetArticle(*new_cit_art);
1159 pub_list.push_back(cit_pub);
1160
1161 if (m_merge_ids) {
1162 MergeNonPubmedPubIds(*cit_art, cit_pub->SetArticle());
1163 }
1164
1165 cit_arts.clear();
1166 cit_arts.push_back(cit_pub);
1167 cit_art = new_cit_art;
1168 }
1169 else {
1170 pmids.clear();
1171
1172 PrintPub(*cit_art, false, true, ENTREZ_ID_TO(long, pmid), m_err_log);
1173 pub_list.splice(pub_list.end(), cit_arts);
1174 }
1175
1176 set_pmid = false;
1177 }
1178 else {
1179 ERR_POST_TO_LISTENER(m_err_log, eDiag_Error, err_Reference, err_Reference_FailedToGetPub,
1180 "Failed to get pub from MedArch server for pmid = " << pmid << ". Input one is preserved.");
1181 }
1182 }
1183
1184 if (set_pmid) {
1185 if (pmids.empty()) {
1186 CRef<CPub> pmid_pub(new CPub);
1187 pmids.push_back(pmid_pub);
1188 }
1189
1190 pmids.front()->SetPmid().Set(pmid);
1191 pub_list.splice(pub_list.end(), pmids);
1192
1193 MedlineToISO(*cit_art);
1194
1195 pub_list.splice(pub_list.end(), cit_arts);
1196 }
1197
1198 PropagateInPress(inpress, *cit_art);
1199 return;
1200 }
1201
1202 PrintPub(*cit_art, false, false, ENTREZ_ID_TO(long, oldpmid), m_err_log);
1203 PropagateInPress(inpress, *cit_art);
1204 pub_list.splice(pub_list.end(), cit_arts);
1205
1206 return;
1207 }
1208
1209 if (oldpmid != ZERO_ENTREZ_ID) {
1210 // have a pmid but no cit-art
1211
1212 CRef<CCit_art> new_cit_art = FetchPubPmId(oldpmid);
1213
1214 if (new_cit_art.NotEmpty()) {
1215
1216 pub_list.splice(pub_list.end(), pmids);
1217
1218 if (m_replace_cit) {
1219 MedlineToISO(*new_cit_art);
1220 CRef<CPub> cit_pub(new CPub);
1221 cit_pub->SetArticle(*new_cit_art);
1222 pub_list.push_back(cit_pub);
1223 }
1224
1225 return;
1226 }
1227 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_Reference, err_Reference_No_reference,
1228 "Cant find article for pmid [" << oldpmid << "]");
1229 }
1230
1231 if (oldpmid > ZERO_ENTREZ_ID) {
1232 pub_list.splice(pub_list.end(), pmids);
1233 }
1234 else if (oldmuid > ZERO_ENTREZ_ID) {
1235 pub_list.splice(pub_list.end(), muids);
1236 }
1237 }
1238
1239
1240 // Tries to make any Pub into muid / cit - art
FixPub(CPub & pub)1241 void CPubFix::FixPub(CPub& pub)
1242 {
1243 switch (pub.Which()) {
1244
1245 case CPub::e_Medline:
1246 {
1247 CRef<CPub_equiv> pub_equiv(new CPub_equiv);
1248 pub_equiv->Set().push_back(CRef<CPub>(new CPub));
1249 pub_equiv->Set().front()->Assign(pub);
1250
1251 SplitMedlineEntry(pub_equiv->Set());
1252 pub.SetEquiv().Assign(*pub_equiv);
1253 }
1254 break;
1255
1256 case CPub::e_Article:
1257 {
1258 CCit_art& cit_art = pub.SetArticle();
1259 if (cit_art.IsSetFrom() && cit_art.GetFrom().IsBook()) {
1260 return;
1261 }
1262
1263 TEntrezId pmid = ZERO_ENTREZ_ID;
1264 try {
1265 CMLAClient mla;
1266 pmid = ENTREZ_ID_FROM(int, mla.AskCitmatchpmid(pub));
1267 }
1268 catch (exception &) {
1269 // pmid == 0;
1270 }
1271
1272 if (pmid > ZERO_ENTREZ_ID) {
1273 PrintPub(cit_art, true, false, ENTREZ_ID_TO(long, pmid), m_err_log);
1274 if (m_replace_cit) {
1275 CRef<CCit_art> new_cit_art = FetchPubPmId(pmid);
1276
1277 if (new_cit_art.NotEmpty()) {
1278 if (TenAuthorsProcess(cit_art, *new_cit_art, m_err_log)) {
1279
1280 if (m_merge_ids) {
1281 MergeNonPubmedPubIds(*new_cit_art, cit_art);
1282 }
1283
1284 CRef<CPub> new_pub(new CPub);
1285 new_pub->SetArticle(*new_cit_art);
1286 pub.SetEquiv().Set().push_back(new_pub);
1287
1288 new_pub.Reset(new CPub);
1289 new_pub->SetPmid().Set(pmid);
1290 pub.SetEquiv().Set().push_back(new_pub);
1291 }
1292 else {
1293 PrintPub(cit_art, false, true, ENTREZ_ID_TO(long, pmid), m_err_log);
1294 MedlineToISO(cit_art);
1295 }
1296 }
1297 }
1298 else {
1299 PrintPub(cit_art, false, false, ENTREZ_ID_TO(long, pmid), m_err_log);
1300 MedlineToISO(cit_art);
1301 }
1302 }
1303 }
1304 break;
1305
1306 case CPub::e_Equiv:
1307 FixPubEquiv(pub.SetEquiv());
1308 break;
1309
1310 default:; // do nothing
1311 }
1312 }
1313
FetchPubPmId(TEntrezId pmid)1314 CRef<CCit_art> CPubFix::FetchPubPmId(TEntrezId pmid)
1315 {
1316 CRef<CCit_art> cit_art;
1317 if (pmid < ZERO_ENTREZ_ID)
1318 return cit_art;
1319
1320 CRef<CPub> pub;
1321 try {
1322 CMLAClient mla;
1323 pub = mla.AskGetpubpmid(CPubMedId(pmid));
1324 }
1325 catch (exception &) {
1326 pub.Reset();
1327 }
1328
1329 if (pub.NotEmpty() && pub->IsArticle()) {
1330 cit_art.Reset(new CCit_art);
1331 cit_art->Assign(pub->GetArticle());
1332
1333 MedlineToISO(*cit_art);
1334 }
1335
1336 return cit_art;
1337 }
1338
1339 bool CAuthListValidator::enabled = true; // Verified in ID-6550, so set to use it by default
1340 // Setting it to false would lead to a few bugs
1341 bool CAuthListValidator::configured = false;
1342 double CAuthListValidator::cfg_matched_to_min = 0.3333;
1343 double CAuthListValidator::cfg_removed_to_gb = 0.3333;
Configure(const CNcbiRegistry & cfg,const string & section)1344 void CAuthListValidator::Configure(const CNcbiRegistry& cfg, const string& section)
1345 {
1346 enabled = cfg.GetBool(section, "enabled", enabled);
1347 cfg_matched_to_min = cfg.GetDouble(section, "matched_to_min", cfg_matched_to_min);
1348 cfg_removed_to_gb = cfg.GetDouble(section, "removed_to_gb", cfg_removed_to_gb);
1349 configured = true;
1350 }
1351
CAuthListValidator(IMessageListener * err_log)1352 CAuthListValidator::CAuthListValidator(IMessageListener* err_log)
1353 : outcome(eNotSet), pub_year(0), reported_limit("not initialized"), m_err_log(err_log)
1354 {
1355 if (! configured) {
1356 Configure(CNcbiApplication::Instance()->GetConfig(), "auth_list_validator");
1357 }
1358 }
1359
validate(const CCit_art & gb_art,const CCit_art & pm_art)1360 CAuthListValidator::EOutcome CAuthListValidator::validate(const CCit_art& gb_art, const CCit_art& pm_art)
1361 {
1362 outcome = eNotSet;
1363 pub_year = 0;
1364 pub_year = pm_art.GetFrom().GetJournal().GetImp().GetDate().GetStd().GetYear();
1365 if (pub_year < 1900 || pub_year > 3000) {
1366 throw logic_error("Publication from PubMed has invalid year: " + std::to_string(pub_year));
1367 }
1368 gb_type = CAuth_list::C_Names::SelectionName(gb_art.GetAuthors().GetNames().Which());
1369 get_lastnames(gb_art.GetAuthors(), removed, gb_auth_string);
1370 pm_type = CAuth_list::C_Names::SelectionName(pm_art.GetAuthors().GetNames().Which());
1371 get_lastnames(pm_art.GetAuthors(), added, pm_auth_string);
1372 matched.clear();
1373 compare_lastnames();
1374 actual_matched_to_min = double(cnt_matched) / cnt_min;
1375 actual_removed_to_gb = double(cnt_removed) / cnt_gb;
1376 if (actual_removed_to_gb > cfg_removed_to_gb) {
1377 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_AuthList, err_AuthList_SignificantDrop,
1378 "Too many authors removed (" << cnt_removed << ") compared to total Genbank authors (" << cnt_gb << ")");
1379 }
1380 // determine outcome according to ID-6514 (see fix_pub.hpp)
1381 if (pub_year > 1999) {
1382 reported_limit = "Unlimited";
1383 outcome = eAccept_pubmed;
1384 }
1385 else if (pub_year > 1995) {
1386 reported_limit = "25 authors";
1387 if (cnt_gb > 25) {
1388 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_AuthList, err_AuthList_PreserveGB,
1389 "Preserving original " << cnt_gb << " GB authors, ignoring " << cnt_pm << " PubMed authors "
1390 << "(PubMed limit was " << reported_limit << " in pub.year " << pub_year << ")");
1391 outcome = eKeep_genbank;
1392 }
1393 else {
1394 outcome = eAccept_pubmed;
1395 }
1396 }
1397 else { // pub_year < 1996
1398 reported_limit = "10 authors";
1399 if (cnt_gb > 10) {
1400 ERR_POST_TO_LISTENER(m_err_log, eDiag_Warning, err_AuthList, err_AuthList_PreserveGB,
1401 "Preserving original " << cnt_gb << " GB authors, ignoring " << cnt_pm << " PubMed authors "
1402 << "(PubMed limit was " << reported_limit << " in pub.year " << pub_year << ")");
1403 outcome = eKeep_genbank;
1404 }
1405 else {
1406 outcome = eAccept_pubmed;
1407 }
1408 }
1409 // check minimum required # of matching authors
1410 if (actual_matched_to_min < cfg_matched_to_min) {
1411 ERR_POST_TO_LISTENER(m_err_log, eDiag_Error, err_AuthList, err_AuthList_LowMatch,
1412 "Only " << cnt_matched << " authors matched between " << cnt_gb << " Genbank and "
1413 << cnt_pm << " PubMed. Match/Min ratio " << fixed << setprecision(2) << actual_matched_to_min
1414 << " is below threshold " << fixed << setprecision(2) << cfg_matched_to_min);
1415 outcome = eFailed_validation;
1416 }
1417 return outcome;
1418 }
1419
DebugDump(CNcbiOstream & out) const1420 void CAuthListValidator::DebugDump(CNcbiOstream& out) const
1421 {
1422 out << "\n--- Debug Dump of CAuthListValidator object ---\n";
1423 out << "pub_year: " << pub_year << "\n";
1424 out << "PubMed Auth-list limit in " << pub_year << ": " << reported_limit << "\n";
1425 out << "Configured ratio 'matched' to 'min(gb,pm)': " << cfg_matched_to_min
1426 << "; actual: " << actual_matched_to_min << "\n";
1427 out << "Configured ratio 'removed' to 'gb': " << cfg_removed_to_gb
1428 << "; actual: " << actual_removed_to_gb << "\n";
1429 out << "GB author list type: " << gb_type << "; # of entries: " << cnt_gb << "\n";
1430 out << "PM author list type: " << pm_type << "; # of entries: " << cnt_pm << "\n";
1431 dumplist("Matched", matched, out);
1432 dumplist("Added", added, out);
1433 dumplist("Removed", removed, out);
1434 const char* outcome_names[] = {"NotSet", "Failed_validation", "Accept_pubmed", "Keep_genbank"};
1435 out << "Outcome reported: " << outcome_names[outcome] << "(" << outcome << ")\n";
1436 out << "--- End of Debug Dump of CAuthListValidator object ---\n\n";
1437 }
1438
dumplist(const char * hdr,const list<string> & lst,CNcbiOstream & out) const1439 void CAuthListValidator::dumplist(const char* hdr, const list<string>& lst, CNcbiOstream& out) const
1440 {
1441 out << lst.size() << " " << hdr << " authors:\n";
1442 for (const auto& a : lst)
1443 out << " " << a << "\n";
1444 }
1445
compare_lastnames()1446 void CAuthListValidator::compare_lastnames()
1447 {
1448 auto gbit = removed.begin();
1449 while (gbit != removed.end()) {
1450 list<string>::iterator gbnext(gbit);
1451 ++gbnext;
1452 list<string>::iterator pmit = std::find(added.begin(), added.end(), *gbit);
1453 if (pmit != added.end()) {
1454 matched.push_back(*gbit);
1455 removed.erase(gbit++);
1456 added.erase(pmit);
1457 }
1458 gbit = gbnext;
1459 }
1460 cnt_matched = matched.size();
1461 cnt_removed = removed.size();
1462 cnt_added = added.size();
1463 cnt_gb = cnt_matched + cnt_removed;
1464 cnt_pm = cnt_matched + cnt_added;
1465 cnt_min = min(cnt_gb, cnt_pm);
1466 }
1467
1468
get_lastnames(const CAuth_list & authors,list<string> & lastnames,string & auth_string)1469 void CAuthListValidator::get_lastnames(const CAuth_list& authors, list<string>& lastnames, string& auth_string)
1470 {
1471 lastnames.clear();
1472 switch (authors.GetNames().Which()) {
1473 case CAuth_list::C_Names::e_Std:
1474 get_lastnames(authors.GetNames().GetStd(), lastnames);
1475 break;
1476 case CAuth_list::C_Names::e_Ml:
1477 {{
1478 CRef< CAuth_list > authlist_std;
1479 authlist_std->Assign(authors);
1480 authlist_std->ConvertMlToStandard();
1481 get_lastnames(authlist_std->GetNames().GetStd(), lastnames);
1482 }}
1483 break;
1484 case CAuth_list::C_Names::e_Str:
1485 get_lastnames(authors.GetNames().GetStr(), lastnames);
1486 break;
1487 default:
1488 throw logic_error("Unexpected CAuth_list::C_Name choice: " + CAuth_list::C_Names::SelectionName(authors.GetNames().Which()));
1489 }
1490 auth_string = NStr::Join(lastnames, "; ");
1491 }
1492
get_lastnames(const CAuth_list::C_Names::TStd & authors,list<string> & lastnames)1493 void CAuthListValidator::get_lastnames(const CAuth_list::C_Names::TStd& authors, list<string>& lastnames)
1494 {
1495 for (auto& name : authors) {
1496 if (name->IsSetName() && name->GetName().IsName() && name->GetName().GetName().IsSetLast()) {
1497 string lname(name->GetName().GetName().GetLast());
1498 lastnames.push_back(NStr::ToLower(lname));
1499 }
1500 }
1501 }
1502
get_lastnames(const CAuth_list::C_Names::TStr & authors,list<string> & lastnames)1503 void CAuthListValidator::get_lastnames(const CAuth_list::C_Names::TStr& authors, list<string>& lastnames)
1504 {
1505 const char* alpha = "abcdefghijklmnopqrstuvwxyz";
1506 for (auto auth : authors) {
1507 size_t eow = NStr::ToLower(auth).find_first_not_of(alpha);
1508 lastnames.push_back(auth.substr(0, eow));
1509 }
1510 }
1511
1512 END_SCOPE(edit)
1513 END_SCOPE(objects)
1514 END_NCBI_SCOPE
1515