1 /* $Id: cleanup_pub.cpp 632626 2021-06-03 17:38:42Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Colleen Bollin
27 *
28 * File Description:
29 * Code for cleaning up publications
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 #include <serial/serialbase.hpp>
35
36 #include <objects/biblio/Affil.hpp>
37 #include <objects/biblio/ArticleId.hpp>
38 #include <objects/biblio/ArticleIdSet.hpp>
39 #include <objects/biblio/Author.hpp>
40 #include <objects/biblio/Auth_list.hpp>
41 #include <objects/biblio/Cit_art.hpp>
42 #include <objects/biblio/Imprint.hpp>
43 #include <objects/general/Name_std.hpp>
44 #include <objects/general/Person_id.hpp>
45
46 #include <objects/seq/Pubdesc.hpp>
47 #include <objects/pub/Pub_equiv.hpp>
48
49 #include <objtools/cleanup/cleanup.hpp>
50 #include <objtools/cleanup/cleanup_pub.hpp>
51 #include "cleanup_utils.hpp"
52 #include <objmgr/util/objutil.hpp>
53
54 BEGIN_NCBI_SCOPE
BEGIN_SCOPE(objects)55 BEGIN_SCOPE(objects)
56
57
58 bool CCleanupPub::x_CleanPubdescComment(string& str)
59 {
60 bool any_change = false;
61 if (CleanDoubleQuote(str)) {
62 any_change = true;
63 }
64 if (CleanVisString(str)) {
65 any_change = true;
66 }
67 return any_change;
68 }
69
CleanPubdesc(CPubdesc & pubdesc,bool strip_serial)70 bool CCleanupPub::CleanPubdesc(CPubdesc& pubdesc, bool strip_serial)
71 {
72 bool any_change = false;
73 if (pubdesc.IsSetComment()) {
74 string& comment = pubdesc.SetComment();
75 any_change |= x_CleanPubdescComment(comment);
76 if (comment.empty()) {
77 pubdesc.ResetComment();
78 any_change = true;
79 }
80 }
81
82 if (pubdesc.IsSetPub()) {
83 CPubEquivCleaner cleaner(pubdesc.SetPub());
84 bool fix_initials = CPubEquivCleaner::ShouldWeFixInitials(pubdesc.GetPub());
85 if (cleaner.Clean(fix_initials, strip_serial)) {
86 any_change = true;
87 }
88 }
89 return any_change;
90 }
91
92
s_PubPriority(CPub::E_Choice val)93 static size_t s_PubPriority(CPub::E_Choice val)
94 {
95 size_t priority = 0;
96 switch (val) {
97 case CPub::e_not_set:
98 priority = 0;
99 break;
100 case CPub::e_Gen:
101 priority = 3;
102 break;
103 case CPub::e_Sub:
104 priority = 4;
105 break;
106 case CPub::e_Medline:
107 priority = 13;
108 break;
109 case CPub::e_Muid:
110 priority = 2;
111 break;
112 case CPub::e_Article:
113 priority = 5;
114 break;
115 case CPub::e_Journal:
116 priority = 6;
117 break;
118 case CPub::e_Book:
119 priority = 7;
120 break;
121 case CPub::e_Proc:
122 priority = 8;
123 break;
124 case CPub::e_Patent:
125 priority = 9;
126 break;
127 case CPub::e_Pat_id:
128 priority = 10;
129 break;
130 case CPub::e_Man:
131 priority = 11;
132 break;
133 case CPub::e_Equiv:
134 priority = 12;
135 break;
136 case CPub::e_Pmid:
137 priority = 1;
138 break;
139 }
140 return priority;
141 }
142
143 inline
144 static
s_PubWhichCompare(CRef<CPub> pub1,CRef<CPub> pub2)145 bool s_PubWhichCompare(CRef<CPub> pub1, CRef<CPub> pub2) {
146 size_t pr1 = s_PubPriority(pub1->Which());
147 size_t pr2 = s_PubPriority(pub2->Which());
148 return (pr1 < pr2);
149 }
150
151
152 struct SPMIDMatch {
153 const CPubMedId& m_ID;
154
operator ()SPMIDMatch155 bool operator()(CRef< CArticleId > other_id)
156 {
157 return (other_id->IsPubmed() && other_id->GetPubmed() == m_ID);
158 }
159 };
160
RemoveDuplicatePubMedArticleIds(CArticleIdSet::Tdata & id_set)161 void RemoveDuplicatePubMedArticleIds(CArticleIdSet::Tdata& id_set)
162 {
163 auto it = id_set.begin();
164 while (it != id_set.end()) {
165 while (it != id_set.end() && !(*it)->IsPubmed()) {
166 ++it;
167 }
168 if (it != id_set.end()) {
169 auto it2 = it;
170 ++it2;
171 SPMIDMatch matcher{ (*it)->GetPubmed() };
172 id_set.erase(std::remove_if(it2, id_set.end(), matcher), id_set.end());
173 ++it;
174 }
175 }
176
177 }
178
Clean(bool fix_initials,bool strip_serial)179 bool CPubEquivCleaner::Clean(bool fix_initials, bool strip_serial)
180 {
181 bool change = false;
182
183 if (!m_Equiv.IsSet()) {
184 return change;
185 }
186
187 if (s_Flatten(m_Equiv)) {
188 change = true;
189 }
190
191 // we keep the last of these because we might transfer one
192 // to the other as necessary to fill in gaps.
193 TEntrezId last_pmid = ZERO_ENTREZ_ID;
194 TEntrezId last_article_pubmed_id = ZERO_ENTREZ_ID; // the last from a journal
195 CRef<CCit_art> last_article;
196
197 auto& pe_set = m_Equiv.Set();
198
199 pe_set.sort(s_PubWhichCompare);
200
201 auto it = pe_set.begin();
202 while (it != pe_set.end()) {
203 CPub &pub = **it;
204
205 CRef<CPubCleaner> cleaner = PubCleanerFactory(pub);
206 if (cleaner) {
207 if (cleaner->Clean(fix_initials, strip_serial)) {
208 change = true;
209 }
210 if (cleaner->IsEmpty()) {
211 it = pe_set.erase(it);
212 continue;
213 }
214 }
215
216 // storing these so at the end we'll know the last values
217 if (pub.IsPmid()) {
218 last_pmid = pub.GetPmid().Get();
219 }
220 if (pub.IsArticle()) {
221 last_article.Reset(&pub.SetArticle());
222 if (last_article->IsSetIds()) {
223 auto& ids = last_article->SetIds().Set();
224 size_t old_size = ids.size();
225 RemoveDuplicatePubMedArticleIds(last_article->SetIds());
226 change = (ids.size() != old_size);
227 // find last article pubmed_id
228 auto id_it = ids.rbegin();
229 while (id_it != ids.rend()) {
230 if ((*id_it)->IsPubmed()) {
231 last_article_pubmed_id = (*id_it)->GetPubmed();
232 break;
233 }
234 ++id_it;
235 }
236 }
237 }
238 ++it;
239 }
240
241 // Now, we might have to transfer data to fill in missing information
242 if (last_pmid == ZERO_ENTREZ_ID && last_article_pubmed_id > ZERO_ENTREZ_ID) {
243 CRef<CPub> new_pub(new CPub);
244 new_pub->SetPmid().Set(last_article_pubmed_id);
245 m_Equiv.Set().insert(m_Equiv.Set().begin(), new_pub);
246 change = true;
247 }
248 else if (last_pmid > ZERO_ENTREZ_ID && last_article_pubmed_id == ZERO_ENTREZ_ID && last_article) {
249 CRef<CArticleId> new_article_id(new CArticleId);
250 new_article_id->SetPubmed().Set(last_pmid);
251 last_article->SetIds().Set().push_back(new_article_id);
252 change = true;
253 }
254 return change;
255 }
256
257
IsEmpty()258 bool CPubEquivCleaner::IsEmpty()
259 {
260 return !m_Equiv.IsSet() || m_Equiv.Get().empty();
261 }
262
ShouldWeFixInitials(const CPub_equiv & equiv)263 bool CPubEquivCleaner::ShouldWeFixInitials(const CPub_equiv& equiv)
264 {
265 if (!equiv.IsSet()) {
266 return false;
267 }
268 #if 0
269 bool has_id = false,
270 has_art = false;
271
272 for (auto it : equiv.Get()) {
273 if ((it->IsPmid() && it->GetPmid() > 0) ||
274 (it->IsMuid() && it->GetMuid() > 0)) {
275 has_id = true;
276 }
277 else if (it->IsArticle()) {
278 has_art = true;
279 }
280 }
281 // return !(has_art && has_id);
282 #endif
283 return true;
284 }
285
286
s_Flatten(CPub_equiv & pub_equiv)287 bool CPubEquivCleaner::s_Flatten(CPub_equiv& pub_equiv)
288 {
289 bool any_change = false;
290 CPub_equiv::Tdata& data = pub_equiv.Set();
291
292 auto it = data.begin();
293 while (it != data.end()) {
294 if ((*it)->IsEquiv()) {
295 CPub_equiv& sub_equiv = (*it)->SetEquiv();
296 s_Flatten(sub_equiv);
297 copy(sub_equiv.Set().begin(), sub_equiv.Set().end(), back_inserter(data));
298 it = data.erase(it);
299 any_change = true;
300 }
301 else {
302 ++it;
303 }
304 }
305 return any_change;
306 }
307
308
309
310
311
312
313
PubCleanerFactory(CPub & pub)314 CRef<CPubCleaner> PubCleanerFactory(CPub& pub)
315 {
316 switch (pub.Which()) {
317 case CPub::e_Gen:
318 return CRef<CPubCleaner>(new CCitGenCleaner(pub.SetGen()));
319 break;
320 case CPub::e_Equiv:
321 return CRef<CPubCleaner>(new CPubEquivCleaner(pub.SetEquiv()));
322 break;
323 case CPub::e_Sub:
324 return CRef<CPubCleaner>(new CCitSubCleaner(pub.SetSub()));
325 break;
326 case CPub::e_Article:
327 return CRef<CPubCleaner>(new CCitArtCleaner(pub.SetArticle()));
328 break;
329 case CPub::e_Journal:
330 return CRef<CPubCleaner>(new CCitJourCleaner(pub.SetJournal()));
331 break;
332 case CPub::e_Book:
333 return CRef<CPubCleaner>(new CCitBookCleaner(pub.SetBook()));
334 break;
335 case CPub::e_Proc:
336 return CRef<CPubCleaner>(new CCitProcCleaner(pub.SetProc()));
337 break;
338 case CPub::e_Patent:
339 return CRef<CPubCleaner>(new CCitPatCleaner(pub.SetPatent()));
340 break;
341 case CPub::e_Man:
342 return CRef<CPubCleaner>(new CCitLetCleaner(pub.SetMan()));
343 break;
344 case CPub::e_Medline:
345 return CRef<CPubCleaner>(new CMedlineEntryCleaner(pub.SetMedline()));
346 break;
347 default:
348 return CRef<CPubCleaner>(NULL);
349 }
350 }
351
352
Clean(bool fix_initials,bool strip_serial)353 bool CCitGenCleaner::Clean(bool fix_initials, bool strip_serial)
354 {
355 bool rval = false;
356 if (m_Gen.IsSetAuthors()) {
357 if (CCleanup::CleanupAuthList(m_Gen.SetAuthors(), fix_initials)) {
358 rval = true;
359 }
360 }
361 if (m_Gen.IsSetCit()) {
362 CCit_gen::TCit& cit = m_Gen.SetCit();
363 if (NStr::StartsWith(cit, "unpublished", NStr::eNocase) && cit[0] != 'U') {
364 cit[0] = 'U';
365 rval = true;
366 }
367 if (!m_Gen.IsSetJournal()
368 && (m_Gen.IsSetVolume() || m_Gen.IsSetPages() || m_Gen.IsSetIssue()))
369 {
370 m_Gen.ResetVolume();
371 m_Gen.ResetPages();
372 m_Gen.ResetIssue();
373 rval = true;
374 }
375 const size_t old_cit_size = cit.size();
376 NStr::TruncateSpacesInPlace(cit);
377 if (old_cit_size != cit.size()) {
378 rval = true;
379 }
380 }
381 if (m_Gen.IsSetPages()) {
382 if (RemoveSpaces(m_Gen.SetPages())) {
383 rval = true;
384 }
385 }
386
387 // title strstripspaces (see 8728 in sqnutil1.c, Mar 11, 2011)
388 if (m_Gen.IsSetTitle() && StripSpaces(m_Gen.SetTitle())) {
389 rval = true;
390 }
391
392 if (strip_serial && m_Gen.IsSetSerial_number()) {
393 m_Gen.ResetSerial_number();
394 rval = true;
395 }
396
397 // erase if the Cit-gen is now entirely blank
398 return rval;
399 }
400
401
IsEmpty()402 bool CCitGenCleaner::IsEmpty()
403 {
404 return (!m_Gen.IsSetCit()) &&
405 !m_Gen.IsSetAuthors() &&
406 (!m_Gen.IsSetMuid() || m_Gen.GetMuid() <= ZERO_ENTREZ_ID) &&
407 !m_Gen.IsSetJournal() &&
408 (!m_Gen.IsSetVolume() || m_Gen.GetVolume().empty()) &&
409 (!m_Gen.IsSetIssue() || m_Gen.GetIssue().empty()) &&
410 (!m_Gen.IsSetPages() || m_Gen.GetPages().empty()) &&
411 !m_Gen.IsSetDate() &&
412 (!m_Gen.IsSetSerial_number() || m_Gen.GetSerial_number() <= 0) &&
413 (!m_Gen.IsSetTitle() || m_Gen.GetTitle().empty()) &&
414 (!m_Gen.IsSetPmid() || m_Gen.GetPmid().Get() <= ZERO_ENTREZ_ID);
415 }
416
417
Clean(bool fix_initials,bool strip_serial)418 bool CCitSubCleaner::Clean(bool fix_initials, bool strip_serial)
419 {
420 bool any_change = false;
421
422 if (m_Sub.IsSetAuthors()) {
423 auto& authors = m_Sub.SetAuthors();
424 if (CCleanup::CleanupAuthList(authors, fix_initials)) {
425 any_change = true;
426 }
427 if (!authors.IsSetAffil() && m_Sub.IsSetImp()) {
428 auto& imp = m_Sub.SetImp();
429 if (imp.IsSetPub()) {
430 authors.SetAffil(imp.SetPub());
431 imp.ResetPub();
432 any_change = true;
433 }
434 }
435 if (authors.IsSetAffil()) {
436 auto& affil = authors.SetAffil();
437 if (affil.IsStr()) {
438 string &str = affil.SetStr();
439 static const string& kBadAffil1 = "to the DDBJ/EMBL/GenBank databases";
440 static const string& kBadAffil2 = "to the INSDC databases";
441 if (NStr::StartsWith(str, kBadAffil1)) {
442 str = str.substr(kBadAffil1.length());
443 NStr::TrimPrefixInPlace(str, ".");
444 any_change = true;
445 }
446 if (NStr::StartsWith(str, kBadAffil2)) {
447 str = str.substr(kBadAffil2.length());
448 NStr::TrimPrefixInPlace(str, ".");
449 any_change = true;
450 }
451
452 if (CCleanup::CleanupAffil(affil)) {
453 any_change = true;
454 }
455 if (CCleanup::IsEmpty(affil)) {
456 authors.ResetAffil();
457 any_change = true;
458 }
459 }
460
461 }
462 }
463 if (m_Sub.IsSetImp() && !m_Sub.IsSetDate()) {
464 auto& imp = m_Sub.SetImp();
465 if (imp.IsSetDate()) {
466 m_Sub.SetDate().Assign(imp.GetDate());
467 m_Sub.ResetImp();
468 }
469 any_change = true;
470 }
471
472 return any_change;
473 }
474
475
IsEmpty()476 bool CCitSubCleaner::IsEmpty()
477 {
478 return false;
479 }
480
481
Clean(bool fix_initials,bool strip_serial)482 bool CCitArtCleaner::Clean(bool fix_initials, bool strip_serial)
483 {
484 bool change = false;
485 if (m_Art.IsSetAuthors()) {
486 if (CCleanup::CleanupAuthList(m_Art.SetAuthors(), fix_initials)) {
487 change = true;
488 }
489 }
490 if (m_Art.IsSetFrom()) {
491 auto& from = m_Art.SetFrom();
492 if (from.IsBook()) {
493 CCitBookCleaner cleaner(from.SetBook());
494 change |= cleaner.Clean(fix_initials, strip_serial);
495 } else if (from.IsProc()) {
496 CCitProcCleaner cleaner(from.SetProc());
497 change |= cleaner.Clean(fix_initials, strip_serial);
498 } else if (from.IsJournal()) {
499 CCitJourCleaner cleaner(from.SetJournal());
500 change |= cleaner.Clean(fix_initials, strip_serial);
501 }
502 }
503
504 return change;
505 }
506
507
Clean(bool fix_initials,bool strip_serial)508 bool CCitBookCleaner::Clean(bool fix_initials, bool strip_serial)
509 {
510 bool change = false;
511 if (m_Book.IsSetAuthors() && CCleanup::CleanupAuthList(m_Book.SetAuthors(), fix_initials)) {
512 change = true;
513 }
514 if (m_Book.IsSetImp() && CleanImprint(m_Book.SetImp(), eImprintBC_ForbidStatusChange)) {
515 change = true;
516 }
517
518 return change;
519 }
520
521
Clean(bool fix_initials,bool strip_serial)522 bool CCitJourCleaner::Clean(bool fix_initials, bool strip_serial)
523 {
524 bool change = false;
525 if (m_Jour.IsSetImp()) {
526 change |= CleanImprint(m_Jour.SetImp(), eImprintBC_AllowStatusChange);
527 }
528
529 return change;
530 }
531
532
Clean(bool fix_initials,bool strip_serial)533 bool CCitProcCleaner::Clean(bool fix_initials, bool strip_serial)
534 {
535 bool change = false;
536 if (m_Proc.IsSetBook()) {
537 CCitBookCleaner cleaner(m_Proc.SetBook());
538 change = cleaner.Clean(fix_initials, strip_serial);
539 }
540 return change;
541 }
542
543
CleanImprint(CImprint & imprint,EImprintBC is_status_change_allowed)544 bool CPubCleaner::CleanImprint(CImprint& imprint, EImprintBC is_status_change_allowed)
545 {
546 bool any_change = false;
547 if (is_status_change_allowed == eImprintBC_AllowStatusChange) {
548 if (imprint.IsSetPubstatus()) {
549 auto pubstatus = imprint.GetPubstatus();
550 switch (pubstatus) {
551 case ePubStatus_aheadofprint:
552 if (!imprint.IsSetPrepub() || imprint.GetPrepub() != CImprint::ePrepub_in_press)
553 {
554 if (!imprint.IsSetVolume() || NStr::IsBlank(imprint.GetVolume())
555 || !imprint.IsSetPages() || NStr::IsBlank(imprint.GetPages())) {
556 imprint.SetPrepub(CImprint::ePrepub_in_press);
557 any_change = true;
558 }
559 }
560 else if (imprint.IsSetVolume() && !NStr::IsBlank(imprint.GetVolume())
561 && imprint.IsSetPages() && !NStr::IsBlank(imprint.GetPages())) {
562 imprint.ResetPrepub();
563 any_change = true;
564 }
565 break;
566 case ePubStatus_epublish:
567 if (imprint.IsSetPrepub() && imprint.GetPrepub() == CImprint::ePrepub_in_press) {
568 imprint.ResetPrepub();
569 any_change = true;
570 }
571 break;
572 default:
573 break;
574 }
575 }
576 }
577 #define FIX_IMPRINT_FIELD(x) \
578 if (imprint.IsSet##x()) { \
579 string& str = imprint.Set##x(); \
580 const size_t old_len = str.length(); \
581 Asn2gnbkCompressSpaces(str); \
582 CleanVisString(str); \
583 if( old_len != str.length() ) { \
584 any_change = true; \
585 } \
586 if (NStr::IsBlank(str)) { \
587 imprint.Reset##x(); \
588 any_change = true; \
589 } \
590 }
591
592 FIX_IMPRINT_FIELD(Volume);
593 FIX_IMPRINT_FIELD(Issue);
594 FIX_IMPRINT_FIELD(Pages);
595 FIX_IMPRINT_FIELD(Section);
596 FIX_IMPRINT_FIELD(Part_sup);
597 FIX_IMPRINT_FIELD(Language);
598 FIX_IMPRINT_FIELD(Part_supi);
599 #undef FIX_IMPRINT_FIELD
600 return any_change;
601 }
602
603
Clean(bool fix_initials,bool strip_serial)604 bool CCitPatCleaner::Clean(bool fix_initials, bool strip_serial)
605 {
606 bool change = false;
607 if (m_Pat.IsSetAuthors() && CCleanup::CleanupAuthList(m_Pat.SetAuthors(), fix_initials)) {
608 change = true;
609 }
610 if (m_Pat.IsSetApplicants() && CCleanup::CleanupAuthList(m_Pat.SetApplicants(), fix_initials)) {
611 change = true;
612 }
613 if (m_Pat.IsSetAssignees() && CCleanup::CleanupAuthList(m_Pat.SetAssignees(), fix_initials)) {
614 change = true;
615 }
616
617 if (m_Pat.IsSetCountry()) {
618 if (NStr::Equal(m_Pat.GetCountry(), "USA")) {
619 m_Pat.SetCountry("US");
620 change = true;
621 }
622 }
623
624 return change;
625 }
626
627
Clean(bool fix_initials,bool strip_serial)628 bool CCitLetCleaner::Clean(bool fix_initials, bool strip_serial)
629 {
630 bool change = false;
631 if (m_Let.IsSetCit() && m_Let.IsSetType() && m_Let.GetType() == CCit_let::eType_thesis) {
632 CCitBookCleaner cleaner(m_Let.SetCit());
633 if (cleaner.Clean(fix_initials, strip_serial)) {
634 change = true;
635 }
636 }
637
638 return change;
639 }
640
641
Clean(bool fix_initials,bool strip_serial)642 bool CMedlineEntryCleaner::Clean(bool fix_initials, bool strip_serial)
643 {
644 bool change = false;
645 if (m_Men.IsSetCit() && m_Men.GetCit().IsSetAuthors()) {
646 change = CCleanup::CleanupAuthList(m_Men.SetCit().SetAuthors(), fix_initials);
647 }
648
649 return change;
650 }
651
652
653 END_SCOPE(objects)
654 END_NCBI_SCOPE
655