1 /* $Id: cleanup_utils.cpp 632626 2021-06-03 17:38:42Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat
27 *
28 * File Description:
29 * General utilities for data cleanup.
30 *
31 * ===========================================================================
32 */
33 #include <ncbi_pch.hpp>
34 #include <corelib/ncbistd.hpp>
35 #include "cleanup_utils.hpp"
36
37 #include <objmgr/util/seq_loc_util.hpp>
38 #include <objmgr/util/sequence.hpp>
39 #include <objects/seq/Pubdesc.hpp>
40 #include <objects/pub/Pub_equiv.hpp>
41 #include <objects/pub/Pub.hpp>
42 #include <objects/biblio/Cit_sub.hpp>
43 #include <objects/biblio/Cit_gen.hpp>
44 #include <objects/biblio/Auth_list.hpp>
45 #include <objects/biblio/Affil.hpp>
46 #include <objects/biblio/Author.hpp>
47 #include <objects/biblio/Imprint.hpp>
48 #include <objects/general/Date.hpp>
49 #include <objects/general/Person_id.hpp>
50 #include <objects/general/Name_std.hpp>
51
52 #include <objects/seq/Seqdesc.hpp>
53 #include <objects/seq/MolInfo.hpp>
54 #include <objects/seq/seq_loc_from_string.hpp>
55 #include <objects/seqfeat/Org_ref.hpp>
56 #include <objects/misc/sequence_macros.hpp>
57
58 #include <objmgr/seqdesc_ci.hpp>
59
60 #include <objtools/cleanup/cleanup_pub.hpp>
61
62
63 BEGIN_NCBI_SCOPE
64 BEGIN_SCOPE(objects)
65
// Character-class tests that compare against the literal ranges 'a'..'z' and
// 'A'..'Z' directly, without consulting the locale.
// NOTE: the argument is expanded twice, so do not pass an expression with
// side effects (e.g. IS_LOWER(*p++)).
#define IS_LOWER(c) ('a'<=(c) && (c)<='z')
#define IS_UPPER(c) ('A'<=(c) && (c)<='Z')

// Allows names from namespace `sequence` to be used unqualified in this file.
using namespace sequence;
70
// Strips spaces, semicolons and commas from both ends of str.
//
// A trailing ';' is preserved when it could be the terminator of an HTML
// character entity, i.e. when the nearest '&', ' ' or ',' at or before the
// last non-junk character is an '&'.
//
// Returns true if str was modified.
bool CleanVisString( string &str )
{
    static const char* const kJunkChars = " ;,";

    if( str.empty() ) {
        return false;
    }

    // ---- leading junk ----
    const string::size_type first_keep = str.find_first_not_of(kJunkChars);
    if( first_keep == string::npos ) {
        // nothing but junk in the whole string
        str.clear();
        return true;
    }
    const bool trimmed_front = ( first_keep > 0 );
    if( trimmed_front ) {
        str.erase( 0, first_keep );
    }

    // ---- trailing junk ----
    // At least one non-junk character exists (checked above), so this search
    // cannot fail.
    const string::size_type last_keep  = str.find_last_not_of(kJunkChars);
    const string::size_type tail_start = last_keep + 1;
    if( tail_start == str.length() ) {
        // no junk at the end
        return trimmed_front;
    }

    // By default everything after the last good character goes away.
    string::size_type new_length = tail_start;

    if( str[tail_start] == ';' ) {
        // The semicolon may close an HTML entity.  Look backwards for an
        // '&'; a space or comma found first means any earlier '&' is
        // unrelated to this semicolon.
        const string::size_type breaker = str.find_last_of( "& ,", last_keep );
        if( breaker != string::npos  &&  str[breaker] == '&' ) {
            // keep the semicolon, chop whatever follows it
            new_length = tail_start + 1;
            if( new_length == str.length() ) {
                // the semicolon is already the last character
                return trimmed_front;
            }
        }
    }

    str.resize( new_length );
    return true;
}
139
// Trims "junk" from both ends of str.
//
// Trailing junk is any run of control characters/spaces (byte value <= ' '),
// '.', ',', '~' and ';'.  It is removed, except that:
//   - if the run contains a period, a single "." is kept, or "..." when
//     allow_ellipses is true and the 2nd and 3rd junk characters are periods;
//   - otherwise, if the run starts with "~~", a double tilde "~~" is kept.
// Leading control characters/spaces are then removed.
//
// Bytes are classified as unsigned values so that characters >= 0x80 (for
// example UTF-8 sequences) are never mistaken for control characters,
// whatever the signedness of plain char on the platform.  This matches
// TrimSpacesSemicolonsAndCommas().
//
// This is based on the C function TrimSpacesAndJunkFromEnds.
// TODO: the logic is duplicated from TrimSpacesAndJunkFromEnds; the two
// should share one implementation.
//
// Returns true if str was modified.
bool CleanVisStringJunk( string &str, bool allow_ellipses )
{
    if ( str.empty() ) {
        return false;
    }

    // Walk backwards to find where the trailing junk starts, noting on the
    // way whether the junk contains a period and/or a tilde.
    bool isPeriod = false;
    bool isTilde = false;
    string::size_type start_of_junk_pos = str.length();
    while ( start_of_junk_pos > 0 ) {
        const unsigned char ch =
            static_cast<unsigned char>( str[start_of_junk_pos - 1] );
        if ( ch > ' ' && ch != '.' && ch != ',' && ch != '~' && ch != ';' ) {
            // non-junk character: the junk starts right after it
            break;
        }
        isPeriod = (isPeriod || ch == '.');
        isTilde  = (isTilde  || ch == '~');
        --start_of_junk_pos;
    }

    bool changed = false;

    // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
    if ( start_of_junk_pos < str.length() ) {

        // what to put back after the junk is removed; nothing by default
        const char * suffix = "";

        const string::size_type chars_in_junk = str.length() - start_of_junk_pos;
        if ( isPeriod ) {
            // allow one period at end, or an ellipsis if requested
            suffix = ".";
            if ( allow_ellipses && (chars_in_junk >= 3) &&
                 str[start_of_junk_pos + 1] == '.' &&
                 str[start_of_junk_pos + 2] == '.' ) {
                suffix = "...";
            }
        } else if ( isTilde ) {
            // allow a double tilde at the end
            if ( str[start_of_junk_pos] == '~' ) {
                const bool doubleTilde =
                    ( (chars_in_junk >= 2) && str[start_of_junk_pos + 1] == '~' );
                suffix = ( doubleTilde ? "~~" : "" );
            }
        }

        if ( suffix[0] != '\0' ) {
            // only touch the string if the tail is not already the suffix
            if ( 0 != str.compare( start_of_junk_pos, string::npos, suffix ) ) {
                str.erase( start_of_junk_pos );
                str += suffix;
                changed = true;
            }
        } else {
            str.erase( start_of_junk_pos );
            changed = true;
        }
    }

    // drop leading control characters / spaces
    string::size_type first_good_pos = 0;
    while ( first_good_pos < str.length() &&
            static_cast<unsigned char>( str[first_good_pos] ) <= ' ' ) {
        ++first_good_pos;
    }
    if ( first_good_pos > 0 ) {
        str.erase( 0, first_good_pos );
        changed = true;
    }

    return changed;
}
227
228
// Removes whitespace (space, tab, newline, carriage return) that sits
// directly between two tildes, so that e.g. "~ \t~" becomes "~~".
// Whitespace between a tilde and any other character is left alone.
//
// Returns true if str was modified.
bool RemoveSpacesBetweenTildes(string& str)
{
    static const string kWhitespace(" \t\n\r");

    // 'anchor' is the tilde currently being examined.
    string::size_type anchor = str.find('~');
    if (anchor == string::npos) {
        return false;   // no tildes at all
    }

    bool modified = false;

    // 'probe' is the first non-whitespace character after the anchor.
    for (string::size_type probe = str.find_first_not_of(kWhitespace, anchor + 1);
         probe != string::npos;
         probe = str.find_first_not_of(kWhitespace, anchor + 1))
    {
        if (str[probe] != '~') {
            // The anchor is followed by ordinary text; move on to the next
            // tilde, if there is one.
            anchor = str.find('~', probe + 1);
            if (anchor == string::npos) {
                break;
            }
            continue;
        }

        if (probe == anchor + 1) {
            // Adjacent tildes: nothing to remove, the second becomes the anchor.
            anchor = probe;
            continue;
        }

        // Only whitespace separates the two tildes: delete it.  The second
        // tilde now sits right after the first and becomes the new anchor.
        str.erase(anchor + 1, probe - anchor - 1);
        ++anchor;
        modified = true;
    }

    return modified;
}
261
262
// Replaces every double quote (") in str with a single quote (').
//
// Returns true if at least one character was replaced.
bool CleanDoubleQuote(string& str)
{
    bool replaced_any = false;
    for (string::size_type quote_pos = str.find('"');
         quote_pos != string::npos;
         quote_pos = str.find('"', quote_pos + 1))
    {
        str[quote_pos] = '\'';
        replaced_any = true;
    }
    return replaced_any;
}
274
275
// Collapses each run of semicolons, spaces and tabs that follows a semicolon.
//
// For every ';' in str, the characters after it that are ';', ' ' or '\t'
// form a "run":
//   - an empty run, or a run that is exactly one space, is left untouched;
//   - otherwise, a run that reaches the end of the string is removed together
//     with the semicolon that starts it;
//   - otherwise the run is replaced by a single space if it contained a
//     space, or removed entirely if it did not.
void TrimInternalSemicolons (string& str)
{
    string::size_type semi = str.find(';');
    while (semi != string::npos) {
        // Measure the run that follows this semicolon.
        string::size_type run_end = semi + 1;
        bool saw_blank = false;
        for ( ; run_end < str.length(); ++run_end) {
            const char c = str[run_end];
            if (c == ' ') {
                saw_blank = true;
            } else if (c != ';' && c != '\t') {
                break;
            }
        }
        const string::size_type run_len = run_end - (semi + 1);

        if (run_len == 0 || (saw_blank && run_len == 1)) {
            // nothing to fix here; continue after the run
            semi = str.find(';', run_end);
        } else if (run_end == str.length()) {
            // only semicolons, spaces and tabs remain: truncate, dropping
            // the leading semicolon as well
            str.erase(semi);
            semi = string::npos;
        } else {
            // squeeze the run down to a single space (or to nothing)
            str.replace(semi + 1, run_len, saw_blank ? " " : "");
            semi = str.find(';', semi + 1);
        }
    }
}
308
// Two adjacent characters packed into one 16-bit value: the first character
// in the high byte, the second in the low byte.  Asn2gnbkCompressSpaces()
// compares its sliding two-character window against these patterns.
#define twocommas ((',') << 8 | (','))
#define twospaces ((' ') << 8 | (' '))
#define twosemicolons ((';') << 8 | (';'))
#define space_comma ((' ') << 8 | (','))
#define space_bracket ((' ') << 8 | (')'))
#define bracket_space (('(') << 8 | (' '))
#define space_semicolon ((' ') << 8 | (';'))
#define comma_space ((',') << 8 | (' '))
#define semicolon_space ((';') << 8 | (' '))

// Normalizes spacing around punctuation in val, in a single left-to-right
// pass over (current, next) character pairs:
//   ",,"            -> ", "
//   "  " and ";;"   -> the first character of the pair is dropped
//   "( "            -> "("
//   " )"            -> ")"
//   " ," / " ;"     -> ", " / "; " and any following run of spaces and that
//                      same punctuation character is skipped
//   ", " / "; "     -> kept, and any following run of spaces and that same
//                      punctuation character is skipped
// Afterwards leading and trailing spaces (only ' ', not tabs or newlines)
// are removed.
//
// Processing stops at the first NUL character in val, if any.
//
// Returns true if val was modified.
bool Asn2gnbkCompressSpaces(string& val)
{
    if (val.empty()) {
        return false;
    }

    // Work in place on a private copy owned by a string, so the buffer is
    // released even if an exception is thrown.  Constructing from c_str()
    // deliberately stops at the first NUL.
    string buf(val.c_str());
    char* const str = &buf[0];

    char* in  = str;    // read cursor, always ahead of (or level with) out
    char* out = str;    // write cursor

    char curr = *in++;
    char next = 0;
    // sliding window: previous character in the high byte, next in the low
    unsigned short two_chars = static_cast<unsigned char>(curr);

    while (curr != '\0') {
        next = *in++;
        two_chars = static_cast<unsigned short>(
            (two_chars << 8) | static_cast<unsigned char>(next));

        if (two_chars == twocommas) {
            // emit the first comma and pretend the second one was a space
            *out++ = curr;
            next = ' ';
            two_chars = static_cast<unsigned char>(next);
        }
        else if (two_chars == twospaces  ||
                 two_chars == twosemicolons  ||
                 two_chars == space_bracket) {
            // drop curr; next is handled on the following iteration
        }
        else if (two_chars == bracket_space) {
            // drop the space: the bracket stays the current character
            next = curr;
            two_chars = static_cast<unsigned char>(curr);
        }
        else if (two_chars == space_comma  ||  two_chars == space_semicolon) {
            // move the punctuation in front of the space, then skip the
            // rest of the run of spaces / that punctuation character
            const char punct = next;
            *out++ = punct;
            *out++ = ' ';
            next = ' ';
            while (next == ' ' || next == punct) {
                next = *in++;
            }
            two_chars = static_cast<unsigned char>(next);
        }
        else if (two_chars == comma_space  ||  two_chars == semicolon_space) {
            // keep "<punct> " and skip the rest of the run
            const char punct = curr;
            *out++ = punct;
            *out++ = ' ';
            while (next == ' ' || next == punct) {
                next = *in++;
            }
            two_chars = static_cast<unsigned char>(next);
        }
        else {
            *out++ = curr;
        }

        curr = next;
    }

    // Everything in [str, out) is the compressed text.
    buf.resize(static_cast<string::size_type>(out - str));

    // TrimSpacesAroundString, but leading/trailing tabs and newlines stay.
    const string::size_type first_kept = buf.find_first_not_of(' ');
    if (first_kept == string::npos) {
        buf.clear();
    } else {
        buf.erase(buf.find_last_not_of(' ') + 1);
        buf.erase(0, first_kept);
    }

    if (buf == val) {
        return false;
    }
    val.swap(buf);
    return true;
}
465
// Trims control characters/spaces (byte value <= ' '), semicolons and commas
// from both ends of val.
//
// At the end of the string a ';' is not treated as junk while an '&' has
// been seen with no space, control character or comma after it, so that a
// trailing HTML character entity such as "&lt;" keeps its terminator.
//
// Bytes are examined as unsigned char so that 8-bit characters of multibyte
// encodings are never mistaken for control characters.
//
// Processing stops at the first NUL character in val, if any.
//
// Returns true if val was modified.
bool TrimSpacesSemicolonsAndCommas(string& val)
{
    if (val.empty()) {
        return false;
    }

    // Private working copy; c_str() construction stops at the first NUL.
    // Being a string, it is released automatically on every exit path.
    string buf(val.c_str());

    // ---- leading junk ----
    string::size_type first_kept = 0;
    while (first_kept < buf.length()) {
        const unsigned char ch = static_cast<unsigned char>(buf[first_kept]);
        if (ch > ' ' && ch != ';' && ch != ',') {
            break;
        }
        ++first_kept;
    }
    buf.erase(0, first_kept);

    // ---- trailing junk ----
    // cut_at: start of the junk run currently being tracked (npos if none).
    // after_amp: an '&' was seen and no space/control/comma since then.
    string::size_type cut_at = string::npos;
    bool after_amp = false;
    for (string::size_type i = 0; i < buf.length(); ++i) {
        const unsigned char ch = static_cast<unsigned char>(buf[i]);
        if (ch == '&') {
            after_amp = true;
            cut_at = string::npos;
        }
        else if (ch <= ' ' || ch == ',') {
            if (cut_at == string::npos) {
                cut_at = i;
            }
            after_amp = false;
        }
        else if (ch == ';') {
            // a semicolon only starts a junk run when it cannot belong to
            // an entity introduced by a preceding '&'
            if (cut_at == string::npos && !after_amp) {
                cut_at = i;
            }
        }
        else {
            // ordinary character: whatever junk run was pending is internal
            cut_at = string::npos;
        }
    }
    if (cut_at != string::npos) {
        buf.erase(cut_at);
    }

    if (buf == val) {
        return false;
    }
    val.swap(buf);
    return true;
}
547
548
// Deletes every character of str for which isspace() is true, compacting
// the remaining characters in place.
//
// Returns true if at least one character was removed.
bool RemoveSpaces(string& str)
{
    const string::size_type original_len = str.length();

    // Characters are copied down from read_at to write_at; write_at never
    // overtakes read_at.
    string::size_type write_at = 0;
    for (string::size_type read_at = 0; read_at < original_len; ++read_at) {
        const char c = str[read_at];
        if (isspace((unsigned char)c)) {
            continue;
        }
        str[write_at++] = c;
    }

    if (write_at == original_len) {
        return false;   // nothing was skipped (also covers the empty string)
    }
    str.resize(write_at);
    return true;
}
568
569 class CGetSeqLocFromStringHelper_ReadLocFromText : public CGetSeqLocFromStringHelper {
570 public:
CGetSeqLocFromStringHelper_ReadLocFromText(CScope * scope)571 CGetSeqLocFromStringHelper_ReadLocFromText( CScope *scope )
572 : m_scope(scope) { }
573
Seq_loc_Add(const CSeq_loc & loc1,const CSeq_loc & loc2,CSeq_loc::TOpFlags flags)574 virtual CRef<CSeq_loc> Seq_loc_Add(
575 const CSeq_loc& loc1,
576 const CSeq_loc& loc2,
577 CSeq_loc::TOpFlags flags )
578 {
579 return sequence::Seq_loc_Add( loc1, loc2, flags, m_scope );
580 }
581
582 private:
583 CScope *m_scope;
584 };
585
ReadLocFromText(const string & text,const CSeq_id * id,CScope * scope)586 CRef<CSeq_loc> ReadLocFromText(const string& text, const CSeq_id *id, CScope *scope)
587 {
588 CGetSeqLocFromStringHelper_ReadLocFromText helper(scope);
589 return GetSeqLocFromString(text, id, &helper);
590 }
591
// One row of the amino-acid lookup table: a residue name and the
// single-letter code that ValidAminoAcid() returns for it.
typedef struct proteinabbrev {
    string abbreviation;    // residue name, e.g. "Ala"
    char letter;            // corresponding one-letter code, e.g. 'A'
} ProteinAbbrevData;

// Table searched linearly by ValidAminoAcid().  Names are matched without
// regard to case, letters are matched exactly.  Note that 'X' appears twice
// ("Xxx" and "OTHER").
static ProteinAbbrevData abbreviation_list[] =
{
    {"Ala", 'A'},
    {"Asx", 'B'},
    {"Cys", 'C'},
    {"Asp", 'D'},
    {"Glu", 'E'},
    {"Phe", 'F'},
    {"Gly", 'G'},
    {"His", 'H'},
    {"Ile", 'I'},
    {"Xle", 'J'}, /* historical note: there used to be no 'J' entry, which broke the naive letter-to-index mapping -Karl */
    {"Lys", 'K'},
    {"Leu", 'L'},
    {"Met", 'M'},
    {"Asn", 'N'},
    {"Pyl", 'O'}, /* historical note: there used to be no 'O' entry */
    {"Pro", 'P'},
    {"Gln", 'Q'},
    {"Arg", 'R'},
    {"Ser", 'S'},
    {"Thr", 'T'},
    {"Val", 'V'},
    {"Trp", 'W'},
    {"Sec", 'U'}, /* historical note: was not in iupacaa */
    {"Xxx", 'X'},
    {"Tyr", 'Y'},
    {"Glx", 'Z'},
    {"TERM", '*'}, /* not in iupacaa; changed by Tatiana 06.07.95(?) */
    {"OTHER", 'X'}
};
628
629 // Find the single-letter abbreviation for either the single letter abbreviation
630 // or three-letter abbreviation.
631 // Use X if the abbreviation is not found.
632
ValidAminoAcid(const string & abbrev)633 char ValidAminoAcid (const string& abbrev)
634 {
635 char ch = 'X';
636
637 for (unsigned int k = 0; k < sizeof(abbreviation_list) / sizeof (ProteinAbbrevData); k++) {
638 if (NStr::EqualNocase (abbrev, abbreviation_list[k].abbreviation)) {
639 ch = abbreviation_list[k].letter;
640 break;
641 }
642 }
643
644 if (abbrev.length() == 1) {
645 for (unsigned int k = 0; k < sizeof(abbreviation_list) / sizeof (ProteinAbbrevData); k++) {
646 if (abbrev.c_str()[0] == abbreviation_list[k].letter) {
647 ch = abbreviation_list[k].letter;
648 break;
649 }
650 }
651 }
652
653 return ch;
654 }
655
656
s_DbtagCompare(const CRef<CDbtag> & dbt1,const CRef<CDbtag> & dbt2)657 bool s_DbtagCompare (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
658 {
659 // is dbt1 < dbt2
660 return dbt1->Compare(*dbt2) < 0;
661 }
662
663
s_DbtagEqual(const CRef<CDbtag> & dbt1,const CRef<CDbtag> & dbt2)664 bool s_DbtagEqual (const CRef<CDbtag>& dbt1, const CRef<CDbtag>& dbt2)
665 {
666 // is dbt1 == dbt2
667 return dbt1->Compare(*dbt2) == 0;
668 }
669
s_OrgrefSynCompare(const string & syn1,const string & syn2)670 bool s_OrgrefSynCompare( const string & syn1, const string & syn2 )
671 {
672 return NStr::CompareNocase(syn1, syn2) < 0;
673 }
674
s_OrgrefSynEqual(const string & syn1,const string & syn2)675 bool s_OrgrefSynEqual( const string & syn1, const string & syn2 )
676 {
677 return NStr::EqualNocase(syn1, syn2);
678 }
679
680
681 END_SCOPE(objects)
682 END_NCBI_SCOPE
683