1 /* $Id: cleanup_utils.cpp 632626 2021-06-03 17:38:42Z ivanov $
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * Author:  Mati Shomrat
27  *
28  * File Description:
29  *   General utilities for data cleanup.
30  *
31  * ===========================================================================
32  */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include "cleanup_utils.hpp"
36 
37 #include <objmgr/util/seq_loc_util.hpp>
38 #include <objmgr/util/sequence.hpp>
39 #include <objects/seq/Pubdesc.hpp>
40 #include <objects/pub/Pub_equiv.hpp>
41 #include <objects/pub/Pub.hpp>
42 #include <objects/biblio/Cit_sub.hpp>
43 #include <objects/biblio/Cit_gen.hpp>
44 #include <objects/biblio/Auth_list.hpp>
45 #include <objects/biblio/Affil.hpp>
46 #include <objects/biblio/Author.hpp>
47 #include <objects/biblio/Imprint.hpp>
48 #include <objects/general/Date.hpp>
49 #include <objects/general/Person_id.hpp>
50 #include <objects/general/Name_std.hpp>
51 
52 #include <objects/seq/Seqdesc.hpp>
53 #include <objects/seq/MolInfo.hpp>
54 #include <objects/seq/seq_loc_from_string.hpp>
55 #include <objects/seqfeat/Org_ref.hpp>
56 #include <objects/misc/sequence_macros.hpp>
57 
58 #include <objmgr/seqdesc_ci.hpp>
59 
60 #include <objtools/cleanup/cleanup_pub.hpp>
61 
62 
63 BEGIN_NCBI_SCOPE
64 BEGIN_SCOPE(objects)
65 
// Range tests against the character literals 'a'..'z' and 'A'..'Z'.
// The argument appears twice in the expansion, so it may be evaluated twice.
#define IS_LOWER(c)     ((c) >= 'a' && (c) <= 'z')
#define IS_UPPER(c)     ((c) >= 'A' && (c) <= 'Z')
68 
69 using namespace sequence;
70 
// Strips leading and trailing runs of ' ', ';' and ',' from str, in place.
//
// A trailing ';' that directly follows the last non-junk character is kept
// when an '&' precedes it with no intervening ' ' or ',', because the
// semicolon may terminate an HTML entity such as "&nbsp;".
//
// Returns true if str was modified.
bool CleanVisString( string &str )
{
    const char * const junk_chars = " ;,";

    if( str.empty() ) {
        return false;
    }

    // locate the first character worth keeping
    const string::size_type head = str.find_first_not_of( junk_chars );
    if( head == string::npos ) {
        // nothing but junk
        str.clear();
        return true;
    }

    bool modified = false;
    if( head != 0 ) {
        // drop the leading junk
        str.erase( 0, head );
        modified = true;
    }

    // A non-junk character exists (see above), so this search cannot fail.
    const string::size_type tail = str.find_last_not_of( junk_chars );
    const string::size_type keep_len = tail + 1;

    if( keep_len == str.length() ) {
        // no trailing junk at all
        return modified;
    }

    if( str[keep_len] != ';' ) {
        // trailing junk starts with ' ' or ',': cut it all
        str.resize( keep_len );
        return true;
    }

    // The junk starts with a semicolon.  Look backwards for an '&' that is
    // not separated from it by a space or comma.
    const string::size_type breaker = str.find_last_of( "& ,", tail );
    if( breaker == string::npos || str[breaker] != '&' ) {
        // no ampersand, or a space/comma makes it irrelevant
        str.resize( keep_len );
        return true;
    }

    // Possible HTML entity: the semicolon stays, anything after it goes.
    if( keep_len + 1 == str.length() ) {
        // the semicolon is already the last character
        return modified;
    }
    str.resize( keep_len + 1 );
    return true;
}
139 
// Trims "junk" from both ends of str, in place.
//
// Based on the C function TrimSpacesAndJunkFromEnds.
// TODO: This function is copy-pasted from TrimSpacesAndJunkFromEnds,
// so we should do something about that since duplicate code is evil.
//
// Trailing junk is any run of characters that are <= ' ' or one of
// '.', ',', '~', ';'.  The run is removed, except that:
//   - if it contains a period, a single "." is left behind (or "..." when
//     allow_ellipses is set, the run has at least three characters, and its
//     second and third characters are both '.');
//   - otherwise, if it starts with "~~", a double tilde "~~" is left behind.
// Leading characters <= ' ' are removed as well.
//
// Bytes are compared as unsigned char, so bytes >= 0x80 (e.g. UTF-8
// multi-byte sequences) are never classified as junk, independent of
// whether plain char is signed on the platform.
//
// Returns true if str was modified.
bool CleanVisStringJunk( string &str, bool allow_ellipses )
{
    if ( str.empty() ) {
        return false;
    }

    // Walk backwards to find where the trailing junk begins, noting on the
    // way whether the junk contains a period and/or a tilde.
    bool has_period = false;
    bool has_tilde = false;
    string::size_type junk_start = str.length();
    while ( junk_start > 0 ) {
        const unsigned char ch = static_cast<unsigned char>( str[junk_start - 1] );
        const bool is_junk =
            ( ch <= ' ' || ch == '.' || ch == ',' || ch == '~' || ch == ';' );
        if ( ! is_junk ) {
            break;
        }
        has_period = ( has_period || ch == '.' );
        has_tilde = ( has_tilde || ch == '~' );
        --junk_start;
    }

    bool changed = false;

    // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
    if ( junk_start < str.length() ) {

        // what to put back after the junk has been removed
        const char * suffix = ""; // by default, just remove junk

        const string::size_type chars_in_junk = str.length() - junk_start;
        if ( has_period ) {
            // allow one period at end
            suffix = ".";
            if ( allow_ellipses && (chars_in_junk >= 3) &&
                 str[junk_start + 1] == '.' && str[junk_start + 2] == '.' ) {
                suffix = "...";
            }
        } else if ( has_tilde ) {
            // allow double tilde(s) at the end
            if ( str[junk_start] == '~' ) {
                const bool double_tilde =
                    ( (chars_in_junk >= 2) && str[junk_start + 1] == '~' );
                suffix = ( double_tilde ? "~~" : "" );
            }
        }

        if ( suffix[0] != '\0' ) {
            // only touch the string if the junk is not already exactly the suffix
            if ( 0 != str.compare( junk_start, string::npos, suffix ) ) {
                str.erase( junk_start );
                str += suffix;
                changed = true;
            }
        } else {
            str.erase( junk_start );
            changed = true;
        }
    }

    // remove leading whitespace / control characters
    string::size_type lead = 0;
    while ( lead < str.length() &&
            static_cast<unsigned char>( str[lead] ) <= ' ' ) {
        ++lead;
    }
    if ( lead > 0 ) {
        str.erase( 0, lead );
        changed = true;
    }

    return changed;
}
227 
228 
// Deletes runs of whitespace (space, tab, newline, carriage return) that sit
// directly between two tildes, so "~ \t~" becomes "~~".  Whitespace elsewhere
// in the string is left alone.
//
// Returns true if str was modified.
bool  RemoveSpacesBetweenTildes(string& str)
{
    const char * const kWhitespace = " \t\n\r";
    bool modified = false;

    // 'anchor' is the tilde currently being examined.
    string::size_type anchor = str.find('~');
    while (anchor != string::npos) {
        // first non-whitespace character after the anchor tilde
        const string::size_type probe =
            str.find_first_not_of(kWhitespace, anchor + 1);
        if (probe == string::npos) {
            break; // only whitespace (or nothing) follows
        }

        if (str[probe] != '~') {
            // something other than a tilde follows; restart from the next tilde
            anchor = str.find('~', probe + 1);
            continue;
        }

        if (probe > anchor + 1) {
            // two tildes separated only by whitespace: close the gap.
            // The second tilde lands right after the anchor.
            str.erase(anchor + 1, probe - anchor - 1);
            modified = true;
            ++anchor;
        } else {
            // two adjacent tildes: move on to the second one
            anchor = probe;
        }
    }
    return modified;
}
261 
262 
// Replaces every double quote (") in str with a single quote (').
// Returns true if at least one character was replaced.
bool CleanDoubleQuote(string& str)
{
    string::size_type quote_pos = str.find('\"');
    if (quote_pos == string::npos) {
        return false;
    }
    do {
        str[quote_pos] = '\'';
        quote_pos = str.find('\"', quote_pos + 1);
    } while (quote_pos != string::npos);
    return true;
}
274 
275 
// Collapses each run of ';', ' ' and '\t' that follows a semicolon, in place:
//   - a semicolon followed by nothing from that set, or by exactly one
//     space, is left untouched;
//   - if the run reaches the end of the string, the string is truncated
//     just before the semicolon that started the run;
//   - otherwise the run is replaced by a single space if it contained a
//     space, or removed entirely if it did not.
void TrimInternalSemicolons (string& str)
{
    string::size_type semi = str.find(';');
    while (semi != string::npos) {
        // measure the run of [; \t] characters following this semicolon
        string::size_type run_end = semi + 1;
        bool saw_space = false;
        for ( ; run_end < str.length(); ++run_end) {
            const char c = str[run_end];
            if (c == ' ') {
                saw_space = true;
            } else if (c != ';' && c != '\t') {
                break;
            }
        }
        const string::size_type run_len = run_end - (semi + 1);

        if (run_len == 0 || (saw_space && run_len == 1)) {
            // nothing to fix, continue with the next semicolon
            semi = str.find(';', run_end);
        } else if (run_end == str.length()) {
            // only semicolons, spaces and tabs remain: cut them off together
            // with the semicolon that introduced them, and stop
            str.erase(semi);
            return;
        } else {
            // squeeze the run down to "" or " "
            str.replace(semi + 1, run_len, saw_space ? " " : "");
            semi = str.find(';', semi + 1);
        }
    }
}
308 
// Two-character patterns, packed as (first << 8 | second), recognised by
// Asn2gnbkCompressSpaces.
#define twocommas ((',') << 8 | (','))
#define twospaces ((' ') << 8 | (' '))
#define twosemicolons ((';') << 8 | (';'))
#define space_comma ((' ') << 8 | (','))
#define space_bracket ((' ') << 8 | (')'))
#define bracket_space (('(') << 8 | (' '))
#define space_semicolon ((' ') << 8 | (';'))
#define comma_space ((',') << 8 | (' '))
#define semicolon_space ((';') << 8 | (' '))

// Normalises spacing around commas, semicolons and parentheses in val,
// then strips leading and trailing spaces (tabs and newlines are kept).
//
// The text is scanned one (current, next) character pair at a time:
//   ",,"          -> the first comma is kept, the second becomes a space
//   "  " / ";;"   -> the first character of the pair is dropped
//   "( "          -> the space is dropped
//   " )"          -> the space is dropped
//   " ," / " ;"   -> rewritten as ", " / "; " and any following run of
//                    spaces and the same punctuation is skipped
//   ", " / "; "   -> kept once; any following run of spaces and the same
//                    punctuation is skipped
// Scanning stops at the first NUL character in val.
//
// The result is built in a std::string instead of a manually allocated
// buffer, so nothing leaks if an allocation fails part-way through.
//
// Returns true if val was modified.
bool Asn2gnbkCompressSpaces(string& val)
{
    if (val.empty()) {
        return false;
    }

    const char * in = val.c_str();   // read cursor; input is NUL-terminated
    string compact;                  // output; never longer than the input
    compact.reserve(val.length());

    char curr = *in++;
    while (curr != '\0') {
        char next = *in++;

        // pack the pair the same way the pattern macros do
        const int packed = (static_cast<unsigned char>(curr) << 8)
                         |  static_cast<unsigned char>(next);

        switch (packed) {
        case twocommas:
            // keep one comma and pretend the second one was a space
            compact += curr;
            next = ' ';
            break;

        case twospaces:
        case twosemicolons:
        case space_bracket:
            // drop the current character, carry on with 'next'
            break;

        case bracket_space:
            // drop the space; the '(' stays the current character
            next = curr;
            break;

        case space_comma:
        case space_semicolon:
            {
                // emit punctuation followed by one space, then skip the
                // rest of the run of spaces / same punctuation
                const char punct = next;
                compact += punct;
                compact += ' ';
                do {
                    next = *in++;
                } while (next == ' ' || next == punct);
            }
            break;

        case comma_space:
        case semicolon_space:
            {
                // same as above, but the punctuation came first
                const char punct = curr;
                compact += punct;
                compact += ' ';
                do {
                    next = *in++;
                } while (next == ' ' || next == punct);
            }
            break;

        default:
            compact += curr;
            break;
        }

        curr = next;
    }

    /* TrimSpacesAroundString but allow leading/trailing tabs/newlines */
    const string::size_type first_kept = compact.find_first_not_of(' ');
    if (first_kept == string::npos) {
        compact.clear();
    } else {
        compact.erase(compact.find_last_not_of(' ') + 1);
        compact.erase(0, first_kept);
    }

    if (val == compact) {
        return false;
    }
    val = compact;
    return true;
}
465 
// Strips leading and trailing whitespace/control characters (<= ' '),
// semicolons and commas from val.
//
// At the end of the string, a ';' is not treated as the start of the
// trailing junk while an '&' has been seen that was not followed by
// whitespace or a comma, so text ending in something like "&nbsp;" keeps
// its semicolon.
//
// Characters are examined as unsigned char so that 8-bit characters are
// never mistaken for control characters.  Processing stops at the first NUL
// character in val.
//
// The work is done in a std::string instead of a manually allocated buffer,
// so nothing leaks if an allocation fails part-way through.
//
// Returns true if val was modified.
bool TrimSpacesSemicolonsAndCommas(string& val)
{
    if (val.empty()) {
        return false;
    }

    // C-string copy: anything after an embedded NUL is discarded
    string trimmed(val.c_str());

    // --- leading junk ---
    string::size_type lead = 0;
    while (lead < trimmed.length()) {
        const unsigned char ch = static_cast<unsigned char>(trimmed[lead]);
        if (ch > ' ' && ch != ';' && ch != ',') {
            break;
        }
        ++lead;
    }
    trimmed.erase(0, lead);

    // --- trailing junk ---
    // 'cut' is the start of the current candidate run of trailing junk
    // (npos when there is none); 'amp_pending' is true while an '&' may still
    // be waiting for its terminating ';'.
    string::size_type cut = string::npos;
    bool amp_pending = false;
    for (string::size_type i = 0; i < trimmed.length(); ++i) {
        const unsigned char ch = static_cast<unsigned char>(trimmed[i]);
        if (ch == '&') {
            amp_pending = true;
            cut = string::npos;
        } else if (ch <= ' ') {
            if (cut == string::npos) {
                cut = i;
            }
            amp_pending = false;
        } else if (ch == ';') {
            // a semicolon only starts a junk run when no '&' is pending
            if (cut == string::npos && !amp_pending) {
                cut = i;
            }
        } else if (ch == ',') {
            if (cut == string::npos) {
                cut = i;
            }
            amp_pending = false;
        } else {
            // ordinary character: whatever run we were tracking is not trailing
            cut = string::npos;
        }
    }
    if (cut != string::npos) {
        trimmed.erase(cut);
    }

    if (val == trimmed) {
        return false;
    }
    val = trimmed;
    return true;
}
547 
548 
// Removes every character for which isspace() is true from str, in place.
// Returns true if at least one character was removed.
bool RemoveSpaces(string& str)
{
    // Compact the kept characters towards the front of the string.
    string::size_type write_pos = 0;
    for (string::size_type read_pos = 0; read_pos < str.length(); ++read_pos) {
        const char c = str[read_pos];
        if (isspace(static_cast<unsigned char>(c))) {
            continue;
        }
        str[write_pos] = c;
        ++write_pos;
    }

    if (write_pos == str.length()) {
        // nothing was skipped (this also covers the empty string)
        return false;
    }
    str.resize(write_pos);
    return true;
}
568 
569 class CGetSeqLocFromStringHelper_ReadLocFromText : public CGetSeqLocFromStringHelper {
570 public:
CGetSeqLocFromStringHelper_ReadLocFromText(CScope * scope)571     CGetSeqLocFromStringHelper_ReadLocFromText( CScope *scope )
572         : m_scope(scope) { }
573 
Seq_loc_Add(const CSeq_loc & loc1,const CSeq_loc & loc2,CSeq_loc::TOpFlags flags)574     virtual CRef<CSeq_loc> Seq_loc_Add(
575         const CSeq_loc&    loc1,
576         const CSeq_loc&    loc2,
577         CSeq_loc::TOpFlags flags )
578     {
579         return sequence::Seq_loc_Add( loc1, loc2, flags, m_scope );
580     }
581 
582 private:
583     CScope *m_scope;
584 };
585 
ReadLocFromText(const string & text,const CSeq_id * id,CScope * scope)586 CRef<CSeq_loc> ReadLocFromText(const string& text, const CSeq_id *id, CScope *scope)
587 {
588     CGetSeqLocFromStringHelper_ReadLocFromText helper(scope);
589     return GetSeqLocFromString(text, id, &helper);
590 }
591 
592 typedef struct proteinabbrev {
593      string abbreviation;
594     char letter;
595 } ProteinAbbrevData;
596 
597 static ProteinAbbrevData abbreviation_list[] =
598 {
599     {"Ala", 'A'},
600     {"Asx", 'B'},
601     {"Cys", 'C'},
602     {"Asp", 'D'},
603     {"Glu", 'E'},
604     {"Phe", 'F'},
605     {"Gly", 'G'},
606     {"His", 'H'},
607     {"Ile", 'I'},
608     {"Xle", 'J'},  /* was - notice no 'J', breaks naive meaning of index -Karl */
609     {"Lys", 'K'},
610     {"Leu", 'L'},
611     {"Met", 'M'},
612     {"Asn", 'N'},
613     {"Pyl", 'O'},  /* was - no 'O' */
614     {"Pro", 'P'},
615     {"Gln", 'Q'},
616     {"Arg", 'R'},
617     {"Ser", 'S'},
618     {"Thr", 'T'},
619     {"Val", 'V'},
620     {"Trp", 'W'},
621     {"Sec", 'U'}, /* was - not in iupacaa */
622     {"Xxx", 'X'},
623     {"Tyr", 'Y'},
624     {"Glx", 'Z'},
625     {"TERM", '*'}, /* not in iupacaa */ /*changed by Tatiana 06.07.95?`*/
626     {"OTHER", 'X'}
627 };
628 
629 // Find the single-letter abbreviation for either the single letter abbreviation
630 // or three-letter abbreviation.
631 // Use X if the abbreviation is not found.
632 
ValidAminoAcid(const string & abbrev)633 char ValidAminoAcid (const string& abbrev)
634 {
635     char ch = 'X';
636 
637     for (unsigned int k = 0; k < sizeof(abbreviation_list) / sizeof (ProteinAbbrevData); k++) {
638         if (NStr::EqualNocase (abbrev, abbreviation_list[k].abbreviation)) {
639             ch = abbreviation_list[k].letter;
640             break;
641         }
642     }
643 
644     if (abbrev.length() == 1) {
645         for (unsigned int k = 0; k < sizeof(abbreviation_list) / sizeof (ProteinAbbrevData); k++) {
646             if (abbrev.c_str()[0] == abbreviation_list[k].letter) {
647                 ch = abbreviation_list[k].letter;
648                 break;
649             }
650         }
651     }
652 
653     return ch;
654 }
655 
656 
s_DbtagCompare(const CRef<CDbtag> & dbt1,const CRef<CDbtag> & dbt2)657 bool s_DbtagCompare (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
658 {
659     // is dbt1 < dbt2
660     return dbt1->Compare(*dbt2) < 0;
661 }
662 
663 
s_DbtagEqual(const CRef<CDbtag> & dbt1,const CRef<CDbtag> & dbt2)664 bool s_DbtagEqual (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
665 {
666     // is dbt1 == dbt2
667     return dbt1->Compare(*dbt2) == 0;
668 }
669 
s_OrgrefSynCompare(const string & syn1,const string & syn2)670 bool s_OrgrefSynCompare( const string & syn1, const string & syn2 )
671 {
672     return NStr::CompareNocase(syn1, syn2) < 0;
673 }
674 
s_OrgrefSynEqual(const string & syn1,const string & syn2)675 bool s_OrgrefSynEqual( const string & syn1, const string & syn2 )
676 {
677     return NStr::EqualNocase(syn1, syn2);
678 }
679 
680 
681 END_SCOPE(objects)
682 END_NCBI_SCOPE
683