1 /* $Id: cleanup_pub.cpp 632626 2021-06-03 17:38:42Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Colleen Bollin
27  *
28  * File Description:
29  *   Code for cleaning up publications
30  *
31  */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
35 
36 #include <objects/biblio/Affil.hpp>
37 #include <objects/biblio/ArticleId.hpp>
38 #include <objects/biblio/ArticleIdSet.hpp>
39 #include <objects/biblio/Author.hpp>
40 #include <objects/biblio/Auth_list.hpp>
41 #include <objects/biblio/Cit_art.hpp>
42 #include <objects/biblio/Imprint.hpp>
43 #include <objects/general/Name_std.hpp>
44 #include <objects/general/Person_id.hpp>
45 
46 #include <objects/seq/Pubdesc.hpp>
47 #include <objects/pub/Pub_equiv.hpp>
48 
49 #include <objtools/cleanup/cleanup.hpp>
50 #include <objtools/cleanup/cleanup_pub.hpp>
51 #include "cleanup_utils.hpp"
52 #include <objmgr/util/objutil.hpp>
53 
54 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)55 BEGIN_SCOPE(objects)
56 
57 
58 bool CCleanupPub::x_CleanPubdescComment(string& str)
59 {
60     bool any_change = false;
61     if (CleanDoubleQuote(str)) {
62         any_change = true;
63     }
64     if (CleanVisString(str)) {
65         any_change = true;
66     }
67     return any_change;
68 }
69 
CleanPubdesc(CPubdesc & pubdesc,bool strip_serial)70 bool CCleanupPub::CleanPubdesc(CPubdesc& pubdesc, bool strip_serial)
71 {
72     bool any_change = false;
73     if (pubdesc.IsSetComment()) {
74         string& comment = pubdesc.SetComment();
75         any_change |= x_CleanPubdescComment(comment);
76         if (comment.empty()) {
77             pubdesc.ResetComment();
78             any_change = true;
79         }
80     }
81 
82     if (pubdesc.IsSetPub()) {
83         CPubEquivCleaner cleaner(pubdesc.SetPub());
84         bool fix_initials = CPubEquivCleaner::ShouldWeFixInitials(pubdesc.GetPub());
85         if (cleaner.Clean(fix_initials, strip_serial)) {
86             any_change = true;
87         }
88     }
89     return any_change;
90 }
91 
92 
s_PubPriority(CPub::E_Choice val)93 static size_t s_PubPriority(CPub::E_Choice val)
94 {
95     size_t priority = 0;
96     switch (val) {
97     case CPub::e_not_set:
98         priority = 0;
99         break;
100     case CPub::e_Gen:
101         priority = 3;
102         break;
103     case CPub::e_Sub:
104         priority = 4;
105         break;
106     case CPub::e_Medline:
107         priority = 13;
108         break;
109     case CPub::e_Muid:
110         priority = 2;
111         break;
112     case CPub::e_Article:
113         priority = 5;
114         break;
115     case CPub::e_Journal:
116         priority = 6;
117         break;
118     case CPub::e_Book:
119         priority = 7;
120         break;
121     case CPub::e_Proc:
122         priority = 8;
123         break;
124     case CPub::e_Patent:
125         priority = 9;
126         break;
127     case CPub::e_Pat_id:
128         priority = 10;
129         break;
130     case CPub::e_Man:
131         priority = 11;
132         break;
133     case CPub::e_Equiv:
134         priority = 12;
135         break;
136     case CPub::e_Pmid:
137         priority = 1;
138         break;
139     }
140     return priority;
141 }
142 
143 inline
144 static
s_PubWhichCompare(CRef<CPub> pub1,CRef<CPub> pub2)145 bool s_PubWhichCompare(CRef<CPub> pub1, CRef<CPub> pub2) {
146     size_t pr1 = s_PubPriority(pub1->Which());
147     size_t pr2 = s_PubPriority(pub2->Which());
148     return (pr1 < pr2);
149 }
150 
151 
152 struct SPMIDMatch {
153     const CPubMedId& m_ID;
154 
operator ()SPMIDMatch155     bool operator()(CRef< CArticleId > other_id)
156     {
157         return (other_id->IsPubmed() && other_id->GetPubmed() == m_ID);
158     }
159 };
160 
RemoveDuplicatePubMedArticleIds(CArticleIdSet::Tdata & id_set)161 void RemoveDuplicatePubMedArticleIds(CArticleIdSet::Tdata& id_set)
162 {
163     auto it = id_set.begin();
164     while (it != id_set.end()) {
165         while (it != id_set.end() && !(*it)->IsPubmed()) {
166             ++it;
167         }
168         if (it != id_set.end()) {
169             auto it2 = it;
170             ++it2;
171             SPMIDMatch matcher{ (*it)->GetPubmed() };
172             id_set.erase(std::remove_if(it2, id_set.end(), matcher), id_set.end());
173             ++it;
174         }
175     }
176 
177 }
178 
Clean(bool fix_initials,bool strip_serial)179 bool CPubEquivCleaner::Clean(bool fix_initials, bool strip_serial)
180 {
181     bool change = false;
182 
183     if (!m_Equiv.IsSet()) {
184         return change;
185     }
186 
187     if (s_Flatten(m_Equiv)) {
188         change = true;
189     }
190 
191     // we keep the last of these because we might transfer one
192     // to the other as necessary to fill in gaps.
193     TEntrezId last_pmid = ZERO_ENTREZ_ID;
194     TEntrezId last_article_pubmed_id = ZERO_ENTREZ_ID; // the last from a journal
195     CRef<CCit_art> last_article;
196 
197     auto& pe_set = m_Equiv.Set();
198 
199     pe_set.sort(s_PubWhichCompare);
200 
201     auto it = pe_set.begin();
202     while (it != pe_set.end()) {
203         CPub &pub = **it;
204 
205         CRef<CPubCleaner> cleaner = PubCleanerFactory(pub);
206         if (cleaner) {
207             if (cleaner->Clean(fix_initials, strip_serial)) {
208                 change = true;
209             }
210             if (cleaner->IsEmpty()) {
211                 it = pe_set.erase(it);
212                 continue;
213             }
214         }
215 
216         // storing these so at the end we'll know the last values
217         if (pub.IsPmid()) {
218             last_pmid = pub.GetPmid().Get();
219         }
220         if (pub.IsArticle()) {
221             last_article.Reset(&pub.SetArticle());
222             if (last_article->IsSetIds()) {
223                 auto& ids = last_article->SetIds().Set();
224                 size_t old_size = ids.size();
225                 RemoveDuplicatePubMedArticleIds(last_article->SetIds());
226                 change = (ids.size() != old_size);
227                 // find last article pubmed_id
228                 auto id_it = ids.rbegin();
229                 while (id_it != ids.rend()) {
230                     if ((*id_it)->IsPubmed()) {
231                         last_article_pubmed_id = (*id_it)->GetPubmed();
232                         break;
233                     }
234                     ++id_it;
235                 }
236             }
237         }
238         ++it;
239     }
240 
241     // Now, we might have to transfer data to fill in missing information
242     if (last_pmid == ZERO_ENTREZ_ID && last_article_pubmed_id > ZERO_ENTREZ_ID) {
243         CRef<CPub> new_pub(new CPub);
244         new_pub->SetPmid().Set(last_article_pubmed_id);
245         m_Equiv.Set().insert(m_Equiv.Set().begin(), new_pub);
246         change = true;
247     }
248     else if (last_pmid > ZERO_ENTREZ_ID && last_article_pubmed_id == ZERO_ENTREZ_ID && last_article) {
249         CRef<CArticleId> new_article_id(new CArticleId);
250         new_article_id->SetPubmed().Set(last_pmid);
251         last_article->SetIds().Set().push_back(new_article_id);
252         change = true;
253     }
254     return change;
255 }
256 
257 
IsEmpty()258 bool CPubEquivCleaner::IsEmpty()
259 {
260     return !m_Equiv.IsSet() || m_Equiv.Get().empty();
261 }
262 
ShouldWeFixInitials(const CPub_equiv & equiv)263 bool CPubEquivCleaner::ShouldWeFixInitials(const CPub_equiv& equiv)
264 {
265     if (!equiv.IsSet()) {
266         return false;
267     }
268 #if 0
269     bool has_id = false,
270         has_art = false;
271 
272     for (auto it : equiv.Get()) {
273         if ((it->IsPmid() && it->GetPmid() > 0) ||
274             (it->IsMuid() && it->GetMuid() > 0)) {
275             has_id = true;
276         }
277         else if (it->IsArticle()) {
278             has_art = true;
279         }
280     }
281     // return !(has_art  &&  has_id);
282 #endif
283     return true;
284 }
285 
286 
s_Flatten(CPub_equiv & pub_equiv)287 bool CPubEquivCleaner::s_Flatten(CPub_equiv& pub_equiv)
288 {
289     bool any_change = false;
290     CPub_equiv::Tdata& data = pub_equiv.Set();
291 
292     auto it = data.begin();
293     while (it != data.end()) {
294         if ((*it)->IsEquiv()) {
295             CPub_equiv& sub_equiv = (*it)->SetEquiv();
296             s_Flatten(sub_equiv);
297             copy(sub_equiv.Set().begin(), sub_equiv.Set().end(), back_inserter(data));
298             it = data.erase(it);
299             any_change = true;
300         }
301         else {
302             ++it;
303         }
304     }
305     return any_change;
306 }
307 
308 
309 
310 
311 
312 
313 
PubCleanerFactory(CPub & pub)314 CRef<CPubCleaner> PubCleanerFactory(CPub& pub)
315 {
316     switch (pub.Which()) {
317     case CPub::e_Gen:
318         return CRef<CPubCleaner>(new CCitGenCleaner(pub.SetGen()));
319         break;
320     case CPub::e_Equiv:
321         return CRef<CPubCleaner>(new CPubEquivCleaner(pub.SetEquiv()));
322         break;
323     case CPub::e_Sub:
324         return CRef<CPubCleaner>(new CCitSubCleaner(pub.SetSub()));
325         break;
326     case CPub::e_Article:
327         return CRef<CPubCleaner>(new CCitArtCleaner(pub.SetArticle()));
328         break;
329     case CPub::e_Journal:
330         return CRef<CPubCleaner>(new CCitJourCleaner(pub.SetJournal()));
331         break;
332     case CPub::e_Book:
333         return CRef<CPubCleaner>(new CCitBookCleaner(pub.SetBook()));
334         break;
335     case CPub::e_Proc:
336         return CRef<CPubCleaner>(new CCitProcCleaner(pub.SetProc()));
337         break;
338     case CPub::e_Patent:
339         return CRef<CPubCleaner>(new CCitPatCleaner(pub.SetPatent()));
340         break;
341     case CPub::e_Man:
342         return CRef<CPubCleaner>(new CCitLetCleaner(pub.SetMan()));
343         break;
344     case CPub::e_Medline:
345         return CRef<CPubCleaner>(new CMedlineEntryCleaner(pub.SetMedline()));
346         break;
347     default:
348         return CRef<CPubCleaner>(NULL);
349     }
350 }
351 
352 
Clean(bool fix_initials,bool strip_serial)353 bool CCitGenCleaner::Clean(bool fix_initials, bool strip_serial)
354 {
355     bool rval = false;
356     if (m_Gen.IsSetAuthors()) {
357         if (CCleanup::CleanupAuthList(m_Gen.SetAuthors(), fix_initials)) {
358             rval = true;
359         }
360     }
361     if (m_Gen.IsSetCit()) {
362         CCit_gen::TCit& cit = m_Gen.SetCit();
363         if (NStr::StartsWith(cit, "unpublished", NStr::eNocase) && cit[0] != 'U') {
364             cit[0] = 'U';
365             rval = true;
366         }
367         if (!m_Gen.IsSetJournal()
368             && (m_Gen.IsSetVolume() || m_Gen.IsSetPages() || m_Gen.IsSetIssue()))
369         {
370             m_Gen.ResetVolume();
371             m_Gen.ResetPages();
372             m_Gen.ResetIssue();
373             rval = true;
374         }
375         const size_t old_cit_size = cit.size();
376         NStr::TruncateSpacesInPlace(cit);
377         if (old_cit_size != cit.size()) {
378             rval = true;
379         }
380     }
381     if (m_Gen.IsSetPages()) {
382         if (RemoveSpaces(m_Gen.SetPages())) {
383             rval = true;
384         }
385     }
386 
387     // title strstripspaces (see 8728 in sqnutil1.c, Mar 11, 2011)
388     if (m_Gen.IsSetTitle() && StripSpaces(m_Gen.SetTitle())) {
389         rval = true;
390     }
391 
392     if (strip_serial && m_Gen.IsSetSerial_number()) {
393         m_Gen.ResetSerial_number();
394         rval = true;
395     }
396 
397     // erase if the Cit-gen is now entirely blank
398     return rval;
399 }
400 
401 
IsEmpty()402 bool CCitGenCleaner::IsEmpty()
403 {
404     return (!m_Gen.IsSetCit()) &&
405         !m_Gen.IsSetAuthors() &&
406         (!m_Gen.IsSetMuid() || m_Gen.GetMuid() <= ZERO_ENTREZ_ID) &&
407         !m_Gen.IsSetJournal() &&
408         (!m_Gen.IsSetVolume() || m_Gen.GetVolume().empty()) &&
409         (!m_Gen.IsSetIssue() || m_Gen.GetIssue().empty()) &&
410         (!m_Gen.IsSetPages() || m_Gen.GetPages().empty()) &&
411         !m_Gen.IsSetDate() &&
412         (!m_Gen.IsSetSerial_number() || m_Gen.GetSerial_number() <= 0) &&
413         (!m_Gen.IsSetTitle() || m_Gen.GetTitle().empty()) &&
414         (!m_Gen.IsSetPmid() || m_Gen.GetPmid().Get() <= ZERO_ENTREZ_ID);
415 }
416 
417 
Clean(bool fix_initials,bool strip_serial)418 bool CCitSubCleaner::Clean(bool fix_initials, bool strip_serial)
419 {
420     bool any_change = false;
421 
422     if (m_Sub.IsSetAuthors()) {
423         auto& authors = m_Sub.SetAuthors();
424         if (CCleanup::CleanupAuthList(authors, fix_initials)) {
425             any_change = true;
426         }
427         if (!authors.IsSetAffil() && m_Sub.IsSetImp()) {
428             auto& imp = m_Sub.SetImp();
429             if (imp.IsSetPub()) {
430                 authors.SetAffil(imp.SetPub());
431                 imp.ResetPub();
432                 any_change = true;
433             }
434         }
435         if (authors.IsSetAffil()) {
436             auto& affil = authors.SetAffil();
437             if (affil.IsStr()) {
438                 string &str = affil.SetStr();
439                 static const string& kBadAffil1 = "to the DDBJ/EMBL/GenBank databases";
440                 static const string& kBadAffil2 = "to the INSDC databases";
441                 if (NStr::StartsWith(str, kBadAffil1)) {
442                     str = str.substr(kBadAffil1.length());
443                     NStr::TrimPrefixInPlace(str, ".");
444                     any_change = true;
445                 }
446                 if (NStr::StartsWith(str, kBadAffil2)) {
447                     str = str.substr(kBadAffil2.length());
448                     NStr::TrimPrefixInPlace(str, ".");
449                     any_change = true;
450                 }
451 
452                 if (CCleanup::CleanupAffil(affil)) {
453                     any_change = true;
454                 }
455                 if (CCleanup::IsEmpty(affil)) {
456                     authors.ResetAffil();
457                     any_change = true;
458                 }
459             }
460 
461         }
462     }
463     if (m_Sub.IsSetImp() && !m_Sub.IsSetDate()) {
464         auto& imp = m_Sub.SetImp();
465         if (imp.IsSetDate()) {
466             m_Sub.SetDate().Assign(imp.GetDate());
467             m_Sub.ResetImp();
468         }
469         any_change = true;
470     }
471 
472     return any_change;
473 }
474 
475 
IsEmpty()476 bool CCitSubCleaner::IsEmpty()
477 {
478     return false;
479 }
480 
481 
Clean(bool fix_initials,bool strip_serial)482 bool CCitArtCleaner::Clean(bool fix_initials, bool strip_serial)
483 {
484     bool change = false;
485     if (m_Art.IsSetAuthors()) {
486         if (CCleanup::CleanupAuthList(m_Art.SetAuthors(), fix_initials)) {
487             change = true;
488         }
489     }
490     if (m_Art.IsSetFrom()) {
491         auto& from = m_Art.SetFrom();
492         if (from.IsBook()) {
493             CCitBookCleaner cleaner(from.SetBook());
494             change |= cleaner.Clean(fix_initials, strip_serial);
495         } else if (from.IsProc()) {
496             CCitProcCleaner cleaner(from.SetProc());
497             change |= cleaner.Clean(fix_initials, strip_serial);
498         } else if (from.IsJournal()) {
499             CCitJourCleaner cleaner(from.SetJournal());
500             change |= cleaner.Clean(fix_initials, strip_serial);
501         }
502     }
503 
504     return change;
505 }
506 
507 
Clean(bool fix_initials,bool strip_serial)508 bool CCitBookCleaner::Clean(bool fix_initials, bool strip_serial)
509 {
510     bool change = false;
511     if (m_Book.IsSetAuthors() && CCleanup::CleanupAuthList(m_Book.SetAuthors(), fix_initials)) {
512         change = true;
513     }
514     if (m_Book.IsSetImp() && CleanImprint(m_Book.SetImp(), eImprintBC_ForbidStatusChange)) {
515         change = true;
516     }
517 
518     return change;
519 }
520 
521 
Clean(bool fix_initials,bool strip_serial)522 bool CCitJourCleaner::Clean(bool fix_initials, bool strip_serial)
523 {
524     bool change = false;
525     if (m_Jour.IsSetImp()) {
526         change |= CleanImprint(m_Jour.SetImp(), eImprintBC_AllowStatusChange);
527     }
528 
529     return change;
530 }
531 
532 
Clean(bool fix_initials,bool strip_serial)533 bool CCitProcCleaner::Clean(bool fix_initials, bool strip_serial)
534 {
535     bool change = false;
536     if (m_Proc.IsSetBook()) {
537         CCitBookCleaner cleaner(m_Proc.SetBook());
538         change = cleaner.Clean(fix_initials, strip_serial);
539     }
540     return change;
541 }
542 
543 
CleanImprint(CImprint & imprint,EImprintBC is_status_change_allowed)544 bool CPubCleaner::CleanImprint(CImprint& imprint, EImprintBC is_status_change_allowed)
545 {
546     bool any_change = false;
547     if (is_status_change_allowed == eImprintBC_AllowStatusChange) {
548         if (imprint.IsSetPubstatus()) {
549             auto pubstatus = imprint.GetPubstatus();
550             switch (pubstatus) {
551             case ePubStatus_aheadofprint:
552                 if (!imprint.IsSetPrepub() || imprint.GetPrepub() != CImprint::ePrepub_in_press)
553                 {
554                     if (!imprint.IsSetVolume() || NStr::IsBlank(imprint.GetVolume())
555                         || !imprint.IsSetPages() || NStr::IsBlank(imprint.GetPages())) {
556                         imprint.SetPrepub(CImprint::ePrepub_in_press);
557                         any_change = true;
558                     }
559                 }
560                 else if (imprint.IsSetVolume() && !NStr::IsBlank(imprint.GetVolume())
561                     && imprint.IsSetPages() && !NStr::IsBlank(imprint.GetPages())) {
562                     imprint.ResetPrepub();
563                     any_change = true;
564                 }
565                 break;
566             case ePubStatus_epublish:
567                 if (imprint.IsSetPrepub() && imprint.GetPrepub() == CImprint::ePrepub_in_press) {
568                     imprint.ResetPrepub();
569                     any_change = true;
570                 }
571                 break;
572             default:
573                 break;
574             }
575         }
576     }
577 #define FIX_IMPRINT_FIELD(x) \
578     if (imprint.IsSet##x()) { \
579         string& str = imprint.Set##x(); \
580         const size_t old_len = str.length(); \
581         Asn2gnbkCompressSpaces(str); \
582         CleanVisString(str); \
583         if( old_len != str.length() ) { \
584             any_change = true; \
585         } \
586         if (NStr::IsBlank(str)) { \
587             imprint.Reset##x(); \
588             any_change = true; \
589         } \
590     }
591 
592     FIX_IMPRINT_FIELD(Volume);
593     FIX_IMPRINT_FIELD(Issue);
594     FIX_IMPRINT_FIELD(Pages);
595     FIX_IMPRINT_FIELD(Section);
596     FIX_IMPRINT_FIELD(Part_sup);
597     FIX_IMPRINT_FIELD(Language);
598     FIX_IMPRINT_FIELD(Part_supi);
599 #undef FIX_IMPRINT_FIELD
600     return any_change;
601 }
602 
603 
Clean(bool fix_initials,bool strip_serial)604 bool CCitPatCleaner::Clean(bool fix_initials, bool strip_serial)
605 {
606     bool change = false;
607     if (m_Pat.IsSetAuthors() && CCleanup::CleanupAuthList(m_Pat.SetAuthors(), fix_initials)) {
608         change = true;
609     }
610     if (m_Pat.IsSetApplicants() && CCleanup::CleanupAuthList(m_Pat.SetApplicants(), fix_initials)) {
611         change = true;
612     }
613     if (m_Pat.IsSetAssignees() && CCleanup::CleanupAuthList(m_Pat.SetAssignees(), fix_initials)) {
614         change = true;
615     }
616 
617     if (m_Pat.IsSetCountry()) {
618         if (NStr::Equal(m_Pat.GetCountry(), "USA")) {
619             m_Pat.SetCountry("US");
620             change = true;
621         }
622     }
623 
624     return change;
625 }
626 
627 
Clean(bool fix_initials,bool strip_serial)628 bool CCitLetCleaner::Clean(bool fix_initials, bool strip_serial)
629 {
630     bool change = false;
631     if (m_Let.IsSetCit() && m_Let.IsSetType() && m_Let.GetType() == CCit_let::eType_thesis) {
632         CCitBookCleaner cleaner(m_Let.SetCit());
633         if (cleaner.Clean(fix_initials, strip_serial)) {
634             change = true;
635         }
636     }
637 
638     return change;
639 }
640 
641 
Clean(bool fix_initials,bool strip_serial)642 bool CMedlineEntryCleaner::Clean(bool fix_initials, bool strip_serial)
643 {
644     bool change = false;
645     if (m_Men.IsSetCit() && m_Men.GetCit().IsSetAuthors()) {
646         change = CCleanup::CleanupAuthList(m_Men.SetCit().SetAuthors(), fix_initials);
647     }
648 
649     return change;
650 }
651 
652 
653 END_SCOPE(objects)
654 END_NCBI_SCOPE
655