1 /* $Id: objutil.cpp 632875 2021-06-09 14:33:04Z ivanov $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author: Mati Shomrat, NCBI
27 *
28 * File Description:
29 * shared utility functions
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34
35 #include <util/strsearch.hpp>
36
37 #include <objects/general/Date.hpp>
38 #include <objects/general/User_object.hpp>
39 #include <objects/general/User_field.hpp>
40 #include <objects/general/Object_id.hpp>
41 #include <objects/general/Date.hpp>
42 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_inst.hpp>
44 #include <objects/seq/Seq_ext.hpp>
45 #include <objects/seq/Delta_ext.hpp>
46 #include <objects/seq/Delta_seq.hpp>
47 #include <objects/seq/Seq_literal.hpp>
48 #include <objects/seq/MolInfo.hpp>
49 #include <objects/seq/seqport_util.hpp>
50 #include <objects/seqloc/Seq_loc.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/bioseq_handle.hpp>
53 #include <objmgr/seqdesc_ci.hpp>
54 #include <objmgr/object_manager.hpp>
55 #include <objmgr/util/sequence.hpp>
56 #include <objects/general/general_macros.hpp>
57 #include <algorithm>
58 #include <objmgr/util/objutil.hpp>
59
60
61 BEGIN_NCBI_SCOPE
62 BEGIN_SCOPE(objects)
63
64
// Characters accepted in the path component that IsPartOfUrl() expects to
// find right after a "/~" (digits, ASCII letters, '_', '-' and '.').
SAFE_CONST_STATIC_STRING(kLegalPathChars, "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_-.");
66
IsPartOfUrl(const string & sentence,size_t pos)67 bool IsPartOfUrl(const string& sentence, size_t pos)
68 {
69 string separators( "( \t\r\n" );
70 const string& legal_path_chars = kLegalPathChars.Get();
71
72 //
73 // Weed out silly input:
74 //
75 if ( sentence == "" || pos > sentence.length() - 1 ) {
76 return false;
77 }
78 if ( string::npos != separators.find( sentence[ pos ] ) ) {
79 return false;
80 }
81
82 // Do easy tests first:
83
84 // We require the tilde to show up in a pattern like
85 // "/~[0..9A..Za..z_-.]+". This is inherited from the C toolkit flat file
86 // generator:
87 //
88 if ( (pos < 1) || (sentence[ pos-1 ] != '/') ) {
89 return false;
90 }
91
92 //
93 // Find the start of the "word" that surrounds the given position:
94 //
95 separators += '~';
96 string::size_type left_edge = sentence.find_last_of( separators, pos-1 );
97 if ( left_edge == string::npos ) {
98 left_edge = 0;
99 }
100 else {
101 ++left_edge;
102 }
103
104 //
105 // If it's a URL, it better start with a protocol specifier we approve of:
106 //
107 static const char* sc_ProtocolSpecifiers[] = {
108 "URL:",
109 "http:",
110 "https:",
111 };
112 DEFINE_STATIC_ARRAY_MAP_WITH_COPY(CStaticArraySet<string>, vProtocolSpecifiers, sc_ProtocolSpecifiers);
113 size_t colon = sentence.find( ':', left_edge );
114 if ( colon == string::npos ) {
115 return false;
116 }
117 string strMaybeUrl = sentence.substr( left_edge, colon - left_edge + 1 );
118 if ( vProtocolSpecifiers.find( strMaybeUrl ) == vProtocolSpecifiers.end() ) {
119 return false;
120 }
121
122 ++pos;
123 if ( string::npos == legal_path_chars.find( sentence[ pos ] ) ) {
124 return false;
125 }
126
127 for ( ++pos; sentence[ pos ] != 0; ++pos ) {
128 if ( string::npos == legal_path_chars.find( sentence[ pos ] ) ) {
129 return ( sentence[ pos ] == '/' );
130 }
131 }
132
133 return false; /* never found the terminating '/' */
134 };
135
136
s_RunOfStars(string & s,SIZE_TYPE start,SIZE_TYPE length)137 static bool s_RunOfStars(string& s, SIZE_TYPE start, SIZE_TYPE length)
138 {
139 SIZE_TYPE max = start + 66;
140 if (max >= length) {
141 return false;
142 }
143 for (SIZE_TYPE i = start; i < max; i++) {
144 if (s[i] != '*') {
145 return false;
146 }
147 }
148 return true;
149 }
150
151
ExpandTildes(string & s,ETildeStyle style)152 void ExpandTildes(string& s, ETildeStyle style)
153 {
154 if ( style == eTilde_tilde ) {
155 return;
156 }
157
158 SIZE_TYPE start = 0, tilde, length = s.length();
159
160 tilde = s.find('~', start);
161 if (tilde == NPOS) { // no tilde
162 return;
163 }
164
165 string result;
166
167 while ( (start < length) && (tilde = s.find('~', start)) != NPOS ) {
168 result.append(s, start, tilde - start);
169 char next = (tilde + 1) < length ? s[tilde + 1] : 0;
170 switch ( style ) {
171 case eTilde_space:
172 if ( (tilde + 1 < length && isdigit((unsigned char) next) ) ||
173 (tilde + 2 < length && (next == ' ' || next == '(') &&
174 isdigit((unsigned char) s[tilde + 2]))) {
175 result += '~';
176 } else {
177 result += ' ';
178 }
179 start = tilde + 1;
180 break;
181
182 case eTilde_newline:
183 if ( tilde + 1 < length && s[tilde + 1] == '~' ) {
184 result += '~';
185 start = tilde + 2;
186 } else {
187 result += "\n";
188 start = tilde + 1;
189 }
190 break;
191
192 case eTilde_note:
193 if ( tilde + 1 < length && s[tilde + 1] == '~' ) {
194 result += '~';
195 start = tilde + 2;
196 } else {
197 // plain "~" expands to ";\n", unless it's after a space or semi-colon, in
198 // which case it becomes a plain "\n"
199 char prevChar = ( tilde >= 1 ? s[tilde - 1] : '\0' );
200
201 if( ' ' == prevChar || ';' == prevChar ) {
202 result += '\n';
203 } else {
204 result += ";\n";
205 }
206 start = tilde + 1;
207 }
208 break;
209
210 case eTilde_comment:
211 if (tilde > 0 && s[tilde - 1] == '`') {
212 result.replace(result.length() - 1, 1, 1,'~');
213 }
214 else if ( IsPartOfUrl( s, tilde ) ) {
215 result += '~';
216 }
217 else {
218 result += "\n";
219 }
220 start = tilde + 1;
221 if (s[start] == ' ' && s_RunOfStars(s, start+1, length)) {
222 start++;
223 result += '\n';
224 }
225 break;
226
227 default: // just keep it, for lack of better ideas
228 result += '~';
229 start = tilde + 1;
230 break;
231 }
232 }
233 if (start < length) {
234 result.append(s, start, NPOS);
235 }
236 s.swap(result);
237 }
238
239
/// Replace every double quote in @a str with a single quote, in place.
void ConvertQuotes(string& str)
{
    for (char& ch : str) {
        if (ch == '\"') {
            ch = '\'';
        }
    }
}
244
245
/// Return a copy of @a str in which every double quote has been replaced
/// with a single quote.
string ConvertQuotes(const string& str)
{
    string converted(str);
    replace(converted.begin(), converted.end(), '\"', '\'');
    return converted;
}
252
253 // Strips all spaces in string in following manner. If the function
254 // meet several spaces (spaces and tabs) in succession it replaces them
255 // with one space. Strips all spaces after '(' and before ( ')' or ',' ).
StripSpaces(string & str)256 bool StripSpaces(string& str)
257 {
258 if (str.empty()) {
259 return false;
260 }
261 auto orig_len = str.length();
262
263 NStr::ReplaceInPlace(str, "\t", " ");
264 auto this_len = str.length();
265 NStr::ReplaceInPlace(str, " ", " ");
266 while (str.length() != this_len) {
267 this_len = str.length();
268 NStr::ReplaceInPlace(str, " ", " ");
269 }
270 NStr::ReplaceInPlace(str, "( ", "(");
271 NStr::ReplaceInPlace(str, " )", ")");
272 NStr::ReplaceInPlace(str, " ,", ",");
273
274 #if 0
275
276 string::iterator end = str.end();
277 string::iterator it = str.begin();
278 string::iterator new_str = it;
279 while (it != end) {
280 *new_str++ = *it;
281 if ( (*it == ' ') || (*it == '\t') || (*it == '(') ) {
282 for (++it; it != end && (*it == ' ' || *it == '\t'); ++it)
283 continue;
284 if (it != end && (*it == ')' || *it == ',')) {
285 if( *(new_str - 1) != '(' ) { // this if protects against the case "(...bunch of spaces and tabs...)". Otherwise, the first '(' is erased
286 --new_str;
287 }
288 }
289 } else {
290 ++it;
291 }
292 }
293 str.erase(new_str, str.end());
294 #endif
295 return (orig_len != str.length());
296 }
297
298
/// Remove a single trailing period from @a str.
///
/// NB: trimming a whole tail of punctuation (e.g. via find_last_not_of) is
/// likely a better solution; however, the C toolkit differs, so only one
/// period is ever removed here.
///
/// @param str            String to modify in place.
/// @param keep_ellipsis  If true, a string ending in "..." is left alone.
/// @return               true if a period was removed.
bool RemovePeriodFromEnd(string& str, bool keep_ellipsis)
{
    const string::size_type len = str.length();

    // nothing to chop off
    if (len == 0  ||  str[len - 1] != '.') {
        return false;
    }

    // a trailing ellipsis is preserved on request
    if (keep_ellipsis  &&  len >= 3  &&  str.compare(len - 3, 3, "...") == 0) {
        return false;
    }

    str.resize(len - 1);
    return true;
}
375
376
/// Make @a str end in exactly one period: any trailing spaces, tabs,
/// tildes, periods and newlines are dropped first, then '.' is appended.
/// A string made up only of such characters becomes ".".
void AddPeriod(string& str)
{
    const string::size_type last_kept = str.find_last_not_of(" \t~.\n");
    if (last_kept == string::npos) {
        str.clear();
    } else {
        str.resize(last_kept + 1);
    }
    str.push_back('.');
}
383
384
/// Remove trailing whitespace from @a str, but never cut into the first
/// @a indent characters.
///
/// @param str     String to trim in place.
/// @param indent  Number of leading characters that are always kept.
///                Strings no longer than this are left unchanged.
void TrimSpaces(string& str, size_t indent)
{
    if (str.empty() || str.length() <= indent) {
        return;
    }

    // 'end' is one past the last character to keep. Working with an
    // exclusive bound avoids decrementing an unsigned index below zero,
    // which happens with an inclusive index when indent == 0 and the whole
    // string is whitespace.
    size_t end = str.length();
    while (end > indent && isspace((unsigned char) str[end - 1])) {
        --end;
    }
    str.erase(end);
}
401
// Predicate: true for characters that have a graphical representation.
// Needed because not all compilers will just let you pass "isgraph" to STL
// find_if.
class CIsGraph
{
public:
    // const, so the predicate can also be invoked through a const object.
    // The cast keeps the argument within the range isgraph() accepts.
    bool operator()( const char c ) const {
        return isgraph((unsigned char)c) != 0;
    }
};
410
/// Compress every run of spaces/unprintable characters in @a str into one
/// space, in place.
///
/// @param str             String to process.
/// @param trim_beginning  If true, strip all spaces and unprintables from
///                        the beginning of the string.
/// @param trim_end        If true, strip all spaces and unprintables from
///                        the end of the string.
/// @return                Reference to @a str (the string you gave it).
string& CompressSpaces( string& str, const bool trim_beginning, const bool trim_end )
{
    if( str.empty() ) {
        return str;
    }

    // Always hand isgraph() an unsigned char value: passing a negative
    // plain char is undefined behavior.
    const auto is_graph = []( char c ) {
        return isgraph( (unsigned char)c ) != 0;
    };

    // set up start_iter and end_iter to determine the range we look at
    string::iterator start_iter = str.begin();
    if( trim_beginning ) {
        start_iter = find_if( str.begin(), str.end(), is_graph );
    }
    if( str.end() == start_iter ) {
        // nothing visible in the whole string
        str.clear();
        return str;
    }

    string::iterator end_iter = str.end();
    if( trim_end ) {
        // base() of the reverse iterator is one past the last visible char
        end_iter = find_if( str.rbegin(), str.rend(), is_graph ).base();
    }
    if( str.begin() == end_iter ) {
        str.clear();
        return str;
    }

    // The main part, where we compress spaces
    string newstr; // result will end up here
    newstr.reserve( end_iter - start_iter );

    // Starts out true so that a leading run (when it was not trimmed) still
    // yields one space.
    bool last_ch_was_printable = true;
    for( string::iterator iter = start_iter; iter < end_iter; ++iter ) {
        const char ch = *iter;
        if( is_graph(ch) ) {
            // visible characters get copied straight
            newstr += ch;
            last_ch_was_printable = true;
        } else {
            // unprintable chars become space, and they're only appended if
            // the last char was printable
            if( last_ch_was_printable ) {
                newstr += ' ';
            }
            last_ch_was_printable = false;
        }
    }

    str.swap( newstr );
    return str;
}
472
473
/// Trim leading whitespace and trailing "junk" from @a str, in place.
///
/// Junk is any trailing run of characters <= ' ', '.', ',', '~' and ';'.
/// What is put back after the junk is removed:
///  - if the junk contains a period: "..." when @a allow_ellipsis is set and
///    the junk is at least three characters long with periods in its second
///    and third positions; otherwise a single ".";
///  - else, if the junk starts with a tilde: "~~" when a second tilde
///    follows immediately, otherwise "~";
///  - otherwise nothing.
///
/// This is based on the C function TrimSpacesAndJunkFromEnds. It should have
/// the same output, except:
///  - We do NOT chop off a semicolon if we determine that it's part of an
///    HTML escape char (e.g. "&bgr;" ).
///  - There are some changes in how tildes are handled; this algo is less
///    likely to remove them.
///
/// TODO: ellipsis trimming should eventually work differently: trim the
/// characters <= ' ' from the front and the junk from the end, then re-add
/// "..." if an ellipsis was chopped off and is allowed, or else a single '.'
/// if any period was chopped off. For compatibility with C the algorithm
/// below is used for now.
///
/// Characters are examined as unsigned char so that bytes >= 0x80 (e.g.
/// UTF-8 text) are never mistaken for whitespace or junk, and so that
/// isalnum() is never handed a negative value.
///
/// @param str             String to trim in place.
/// @param allow_ellipsis  Whether a trailing "..." may be retained.
/// @return                true if it changed the string.
bool TrimSpacesAndJunkFromEnds(string& str, bool allow_ellipsis)
{
    if ( str.empty() ) {
        return false;
    }

    // make start_of_junk_pos hold the beginning of the "junk" at the end
    // while we're at it, also check if the junk contains a tilde and/or period
    bool isPeriod = false;
    bool isTilde = false;
    int start_of_junk_pos = (int)str.length() - 1;
    for( ; start_of_junk_pos >= 0 ; --start_of_junk_pos ) {
        const unsigned char ch = str[start_of_junk_pos];
        if (ch <= ' ' || ch == '.' || ch == ',' || ch == '~' || ch == ';') {
            // found junk character; remember periods and tildes
            isPeriod = (isPeriod || ch == '.');
            isTilde = (isTilde || ch == '~');
        } else {
            // found non-junk character. Last junk character is just after this
            ++start_of_junk_pos;
            break;
        }
    }
    // special case of the whole string being junk
    if( start_of_junk_pos < 0 ) {
        start_of_junk_pos = 0;
    }

    // check for ';' that's part of an HTML escape char like "&bgr;" and
    // skip over it (i.e., don't remove it) if so
    if( start_of_junk_pos < (int)str.length() && str[start_of_junk_pos] == ';' ) {
        // we assume no HTML escape char will be longer than this
        static const int kMaxCharsToLookAt = 20;

        // go backwards, looking for the ampersand
        int amp_iter = (start_of_junk_pos - 1);
        for( ; amp_iter >= 0 && ((start_of_junk_pos - amp_iter) < kMaxCharsToLookAt); --amp_iter ) {
            const unsigned char ch = str[amp_iter];
            if( isalnum(ch) || ch == '#' ) {
                // just keep going
            } else if( ch == '&' ) {
                // The semicolon ends an HTML escape character, so we skip it
                ++start_of_junk_pos;
                break;
            } else {
                // The semicolon does NOT end an HTML escape character, so we might remove it
                break;
            }
        }
    }

    bool changed = false;

    // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
    if ( start_of_junk_pos < (int)str.length() ) {

        // holds the suffix to add after we remove the junk
        const char * suffix = ""; // by default, just remove junk

        // always >= 1 inside this branch
        const int chars_in_junk = ( (int)str.length() - start_of_junk_pos );

        // allow one period at end
        if (isPeriod) {
            // check if we should put an ellipsis, or just a period
            const bool putEllipsis = ( allow_ellipsis && (chars_in_junk >= 3) &&
                str[start_of_junk_pos+1] == '.' && str[start_of_junk_pos+2] == '.' );

            suffix = ( putEllipsis ? "..." : "." );
        } else if (isTilde ) {
            // allow tilde(s)
            // (This should work on single- AND double-tildes because
            // we don't know whether or not tilde-expansion was called before
            // this point )
            if ( str[start_of_junk_pos] == '~' ) {
                const bool doubleTilde = ( (chars_in_junk >= 2) && str[start_of_junk_pos+1] == '~' );
                suffix = ( doubleTilde ? "~~" : "~" );
            }
        }
        if( suffix[0] != '\0' ) {
            // only touch the string if the tail is not already the suffix
            if( 0 != str.compare( start_of_junk_pos, string::npos, suffix) ) {
                str.erase( start_of_junk_pos );
                str += suffix;
                changed = true;
            }
        } else {
            str.erase( start_of_junk_pos );
            changed = true;
        }
    }

    // drop the initial whitespace
    string::iterator input_iter = str.begin();
    while ( input_iter != str.end() && (unsigned char)*input_iter <= ' ') {
        ++input_iter;
    }
    if( input_iter != str.begin() ) {
        str.erase( str.begin(), input_iter );
        changed = true;
    }

    return changed;
}
645
646 // this is copy-pasted method and optimized to use CTempString
TrimSpacesAndJunkFromEnds(string & result,const CTempString & str,bool allow_ellipsis)647 void TrimSpacesAndJunkFromEnds(string& result, const CTempString& str, bool allow_ellipsis)
648 {
649 // TODO: This commented out code represents how ellipsis trimming
650 // should work. However, for compatibility with C, we're using a
651 // (in my opinion) suboptimal algorithm. We can switch over later.
652
653 //if (str.empty()) {
654 // return;
655 //}
656
657 //size_t strlen = str.length();
658 //size_t begin = 0;
659
660 //// trim unprintable characters (and space) off the beginning
661 //while (begin != strlen) {
662 // unsigned char ch = str[begin];
663 // if (ch > ' ') {
664 // break;
665 // } else {
666 // ++begin;
667 // }
668 //}
669
670 //// we're done if we trimmed the string to nothing
671 //if (begin == strlen) {
672 // str.erase();
673 // return;
674 //}
675
676 //// trim junk off the end (while we're at it, record whether we're chopping off a period)
677 //size_t end = strlen - 1;
678 //bool has_period = false;
679 //while (end > begin) {
680 // unsigned char ch = str[end];
681 // if (ch <= ' ' || ch == '.' || ch == ',' || ch == '~' || ch == ';') {
682 // has_period = (has_period || ch == '.');
683 // --end;
684 // } else {
685 // break;
686 // }
687 //}
688
689 //// check whether we're about to chop off an ellipsis, so we remember to add it back
690 //// TODO: There's got to be a more efficient way of doing this
691 //const bool weChoppedOffAnEllipsis = ( NPOS != NStr::Find(str, "...", end) );
692
693 //// do the actual chopping here
694 //str = str.substr( begin, end + 1 );
695
696 //// restore chopped off ellipsis or period, if any
697 //if ( allow_ellipsis && weChoppedOffAnEllipsis ) {
698 // str += "...";
699 //} else if (has_period) {
700 // // re-add any periods if we had one before
701 // str += '.';
702 //}
703
704 // This is based on the C function TrimSpacesAndJunkFromEnds.
705 // Although it's updated to use iterators and such and to
706 // return whether it changed the string, it should
707 // have the same output, except:
708 // - We do NOT chop off a semicolon if we determine that it's
709 // part of an HTML escape char (e.g. "&bgr;" ).
710 // - There are some changes in how tildes are handled;
711 // this algo is less likely to remove them.
712
713 if (str.empty()) {
714 result.clear();
715 return;
716 }
717
718 // make start_of_junk_pos hold the beginning of the "junk" at the end
719 // (where junk is defined as one of several characters)
720 // while we're at it, also check if the junk contains a tilde and/or period
721 bool isPeriod = false;
722 bool isTilde = false;
723 size_t start_of_junk_pos = 0;
724 for (size_t len = str.length(); len && start_of_junk_pos == 0; len--)
725 {
726 char ch = str[len-1];
727 if (ch <= ' ') ch = ' ';
728 switch (ch)
729 {
730 case '.':
731 isPeriod = true;
732 break;
733 case '~':
734 isTilde = true;
735 break;
736 case ';':
737 case ',':
738 case ' ':
739 break;
740 default:
741 // found non-junk character. Last junk character is just after this
742 start_of_junk_pos = len;
743 break;
744 }
745 }
746
747 // check for ';' that's part of an HTML escape char like "&bgr;" and
748 // skip over it (i.e., don't remove it) if so
749 if (start_of_junk_pos < str.length() && str[start_of_junk_pos] == ';') {
750 // we assume no HTML escape char will be longer than this
751 static const int kMaxCharsToLookAt = 20;
752
753 // go backwards, looking for the ampersand
754 int amp_iter = ((int)start_of_junk_pos - 1);
755 for (; amp_iter >= 0 && ((start_of_junk_pos - amp_iter) < kMaxCharsToLookAt); --amp_iter) {
756 const unsigned char ch = str[amp_iter];
757 if (isalnum(ch) || ch == '#') {
758 // just keep going
759 }
760 else if (ch == '&') {
761 // The semicolon ends an HTML escape character, so we skip it
762 ++start_of_junk_pos;
763 break;
764 }
765 else {
766 // The semicolon does NOT end an HTML escape character, so we might remove it
767 break;
768 }
769 }
770 }
771
772 // holds the suffix to add after we remove the junk
773 CTempString suffix; // by default, just remove junk
774
775 // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
776 if (start_of_junk_pos < str.length()) {
777
778 const int chars_in_junk = (int)(str.length() - start_of_junk_pos);
779 _ASSERT(chars_in_junk >= 1);
780
781 // allow one period at end
782 if (isPeriod) {
783 // check if we should put an ellipsis, or just a period
784 const bool putEllipsis = (allow_ellipsis && (chars_in_junk >= 3) &&
785 str[start_of_junk_pos + 1] == '.' && str[start_of_junk_pos + 2] == '.');
786
787 suffix = (putEllipsis ? "..." : ".");
788 }
789 else if (isTilde) {
790 // allow tilde(s)
791 // (This should work on single- AND double-tildes because
792 // we don't know whether or not tilde-expansion was called before this
793 // point )
794 if (str[start_of_junk_pos] == '~') {
795 const bool doubleTilde = ((chars_in_junk >= 2) && str[start_of_junk_pos + 1] == '~');
796 suffix = (doubleTilde ? "~~" : "~");
797 }
798 }
799 }
800 const char* ptr = str.data();
801 size_t len = start_of_junk_pos;
802 while (len && *ptr <= ' ')
803 {
804 len--; ptr++;
805 }
806 result.reserve(len + suffix.length());
807 result.assign(ptr, len);
808 result.append(suffix.data(), suffix.length());
809 }
810
// Two-byte combinations we're looking to clean. twochars(a,b) packs the
// pair into one Uint2 (a in the high byte, b in the low byte) so that
// CleanAndCompress() can switch on "previous character, current character".
#define twochars(a,b) Uint2((a) << 8 | (b))
#define twocommas twochars(',',',')         // ",,"
#define twospaces twochars(' ',' ')         // two spaces
#define twosemicolons twochars(';',';')     // ";;"
#define space_comma twochars(' ',',')       // " ,"
#define space_bracket twochars(' ',')')     // " )"
#define bracket_space twochars('(',' ')     // "( "
#define space_semicolon twochars(' ',';')   // " ;"
#define comma_space twochars(',',' ')       // ", "
#define semicolon_space twochars(';',' ')   // "; "
822
CleanAndCompress(string & dest,const CTempString & instr)823 void CleanAndCompress(string& dest, const CTempString& instr)
824 {
825 size_t left = instr.size();
826 // this is the input stream
827 const char* in = instr.data();
828
829 // skip front white spaces
830 while (left && *in == ' ')
831 {
832 in++;
833 left--;
834 }
835 // forget end white spaces
836 while (left && in[left - 1] == ' ')
837 {
838 left--;
839 }
840
841 dest.resize(left);
842
843 if (left < 1) return;
844
845 // this is where we write result
846 char* out = (char*)dest.c_str();
847
848 char curr = *in++; // initialize with first character
849 left--;
850
851 char next = 0;
852 Uint2 two_chars = curr; // this is two bytes storage where we see current and previous symbols
853
854 while (left > 0) {
855 next = *in++;
856
857 two_chars = Uint2((two_chars << 8) | next);
858
859 switch (two_chars)
860 {
861 case twocommas: // replace double commas with comma+space
862 *out++ = curr;
863 next = ' ';
864 break;
865 case twospaces: // skip multiple spaces (only print last one)
866 break;
867 case twosemicolons: // skip multiple semicolons (only print last one)
868 break;
869 case bracket_space: // skip space after bracket
870 next = curr;
871 two_chars = curr;
872 break;
873 case space_bracket: // skip space before bracket
874 break;
875 case space_comma:
876 *out++ = next;
877 next = curr;
878 *out++ = ' ';
879 while ((next == ' ' || next == ',') && left > 0) {
880 next = *in;
881 in++;
882 left--;
883 }
884 two_chars = next;
885 break;
886 case space_semicolon:
887 *out++ = next;
888 next = curr;
889 *out++ = ' ';
890 while ((next == ' ' || next == ';') && left > 0) {
891 next = *in;
892 in++;
893 left--;
894 }
895 two_chars = next;
896 break;
897 case comma_space:
898 *out++ = curr;
899 *out++ = ' ';
900 while ((next == ' ' || next == ',') && left > 0) {
901 next = *in;
902 in++;
903 left--;
904 }
905 two_chars = next;
906 break;
907 case semicolon_space:
908 *out++ = curr;
909 *out++ = ' ';
910 while ((next == ' ' || next == ';') && left > 0) {
911 next = *in;
912 in++;
913 left--;
914 }
915 two_chars = next;
916 break;
917 default:
918 *out++ = curr;
919 break;
920 }
921
922 curr = next;
923 if (left > 0) {
924 left--;
925 }
926 }
927
928 if (curr > 0 && curr != ' ') {
929 *out++ = curr;
930 }
931
932 dest.resize(out - dest.c_str());
933 }
934
#if 0
// Disabled ad-hoc check for CleanAndCompress(): when enabled, the static
// object below runs the sample inputs at start-up and prints
// "<input>---><output>." for each of them to cout.
struct CleanAndCompress_unit_test
{
    CleanAndCompress_unit_test()
    {
        test("C( )C");
        test("xx,,xx");
        test("xx,, xx");
        test("xx,, xx");
        test(" xx xx ");
        test("xx , xx");
        test("xx , xx");
        test("xx(xx)");
        test("xx( xx )");
    }
    // Clean one sample and print input and output side by side.
    void test(char* s)
    {
        string str;
        CleanAndCompress(str, s);
        cout << s << "--->" << str << '.' << endl;
    }
};

CleanAndCompress_unit_test t;
#endif
960
961
962 /*
963 void CleanAndCompress (string& str)
964 {
965 if (str.empty()) {
966 return;
967 }
968
969 size_t pos = str.find (" ,");
970 if (pos != NPOS) {
971 str [pos] = ',';
972 str [pos+1] = ' ';
973 }
974 pos = str.find (",,");
975 if (pos != NPOS) {
976 str [pos+1] = ' ';
977 }
978 pos = str.find (" ;");
979 if (pos != NPOS) {
980 str [pos] = ';';
981 str [pos+1] = ' ';
982 }
983 pos = str.find ("( ");
984 if (pos != NPOS) {
985 str [pos] = ' ';
986 str [pos+1] = '(';
987 }
988 pos = str.find (" )");
989 if (pos != NPOS) {
990 str [pos] = ')';
991 str [pos+1] = ' ';
992 }
993
994 string::iterator end = str.end();
995 string::iterator it = str.begin();
996 string::iterator new_str = it;
997 while (it != end) {
998 *new_str++ = *it;
999 if ( (*it == ' ') || (*it == '\t') || (*it == '(') ) {
1000 for (++it; (it != end) && (*it == ' ' || *it == '\t'); ++it) continue;
1001 if ((it != end) && (*it == ')' || *it == ',') ) {
1002 // this "if" protects against the case "(...bunch of spaces and tabs...)".
1003 // Otherwise, the first '(' is unintentionally erased
1004 if( *(new_str - 1) != '(' ) {
1005 --new_str;
1006 }
1007 }
1008 } else {
1009 ++it;
1010 }
1011 }
1012 str.erase(new_str, str.end());
1013 }
1014 */
1015
1016
#if 0
// Disabled ad-hoc driver that feeds the same samples to both overloads of
// TrimSpacesAndJunkFromEnds(); the results are not checked, so it is only
// useful under a debugger.
struct CJunkUnitTest
{
    // Run the in-place overload, then the CTempString overload, on v.
    void test(CTempString v, bool a_e)
    {
        string res(v);
        TrimSpacesAndJunkFromEnds(res, a_e);
        TrimSpacesAndJunkFromEnds(res, v, a_e);
    }
    CJunkUnitTest()
    {
        test(" .", true);
        test(" aaa bbb.....", true);
        test(" aaa bbb.....", false);
        test(" aaa bbb~~~~~", true);
        test(" aaa bbb,,,,,", true);
        test(" aaa bbb;;;;;;", true);
    }
};

static CJunkUnitTest c;
#endif
1039
/// Tell whether a word may start at @a pos of @a str, i.e. whether the
/// character just before it is whitespace or punctuation.
///
/// NB: To preserve the behavior of the C toolkit we only test on the left.
/// This was an old bug in the C toolkit that was never fixed and by now
/// has become the expected behavior.
///
/// @return true if @a pos is 0 or beyond the end of the string, or if the
///         preceding character is a space or punctuation character.
static bool s_IsWholeWord(const string& str, size_t pos)
{
    if (pos == 0  ||  pos > str.size()) {
        return true;
    }
    const unsigned char before = str[pos - 1];
    return isspace(before)  ||  ispunct(before);
}
1048
1049
JoinString(string & to,const string & prefix,const string & str,bool noRedundancy)1050 void JoinString(string& to, const string& prefix, const string& str, bool noRedundancy)
1051 {
1052 if ( str.empty() ) {
1053 return;
1054 }
1055
1056 if ( to.empty() ) {
1057 to += str;
1058 return;
1059 }
1060
1061 size_t pos = NPOS;
1062 if (noRedundancy) {
1063 //for ( pos = NStr::Find(to, str); pos != NPOS; pos += str.length()) {
1064 for ( pos = NStr::Find(to, str);
1065 pos != NPOS; pos = NStr::Find(to, str, pos + 1)) {
1066 if (s_IsWholeWord(to, pos)) {
1067 return;
1068 }
1069 }
1070 }
1071
1072 //LOG_POST(Error << "adding: to=" << to << " prefix=" << prefix << " str=" << str);
1073
1074 if( NStr::StartsWith(prefix, ";") && NStr::EndsWith(to, ";") ) {
1075 to += prefix.substr(1);
1076 } else {
1077 to += prefix;
1078 }
1079 to += str;
1080 }
1081
1082
JoinString(const list<string> & l,const string & delim,bool noRedundancy)1083 string JoinString(const list<string>& l, const string& delim, bool noRedundancy)
1084 {
1085 if ( l.empty() ) {
1086 return kEmptyStr;
1087 }
1088
1089 /**
1090 string result;
1091 set<CTempString> strings;
1092 ITERATE (list<string>, it, l) {
1093 if ( !noRedundancy ||
1094 strings.insert(CTempString(*it)).second) {
1095 if ( !result.empty() ) {
1096 result += delim;
1097 }
1098 result += *it;
1099 }
1100 }
1101 **/
1102
1103 string result = l.front();
1104 list<string>::const_iterator it = l.begin();
1105 while ( ++it != l.end() ) {
1106 JoinString(result, delim, *it, noRedundancy);
1107 }
1108
1109 return result;
1110 }
1111
1112
// Validate the correct format of an accession string.
//
// Layout that is scanned: an optional "NZ_" prefix, a run of letters, a run
// of underscores, a run of digits; the scan must then stop at the end of the
// string, at a blank or at a '.'.  The letter/underscore/digit counts decide
// whether the accession is accepted:
//   no underscore : 1+5, 2+6, 3+5, 4+8 or 4+9 (letters+digits)
//   one underscore: "MAP_" + 6 digits, or a two-letter prefix from the list
//                   below followed by 6, 8 or 9 digits
// Anything of 16 characters or more is rejected outright.
static bool s_IsValidAccession(const string& acc)
{
    static const size_t kMaxAccLength = 16;

    if ( acc.empty()  ||  acc.length() >= kMaxAccLength ) {
        return false;
    }

    // first character must be an uppercase letter
    const unsigned char lead = (unsigned char) acc[0];
    if ( !isalpha(lead)  ||  !isupper(lead) ) {
        return false;
    }

    // Scan by index.  acc[acc.size()] is the terminating '\0', which stops
    // each of the three runs below, so idx never goes past acc.size().
    size_t idx = (acc.compare(0, 3, "NZ_") == 0) ? 3 : 0;

    size_t num_alpha = 0;
    while ( isalpha((unsigned char) acc[idx]) ) {
        ++idx;
        ++num_alpha;
    }
    size_t num_undersc = 0;
    while ( acc[idx] == '_' ) {
        ++idx;
        ++num_undersc;
    }
    size_t num_digits = 0;
    while ( isdigit((unsigned char) acc[idx]) ) {
        ++idx;
        ++num_digits;
    }

    const char stop = acc[idx];
    if ( stop != '\0'  &&  stop != ' '  &&  stop != '.' ) {
        return false;
    }

    if ( num_undersc == 0 ) {
        return (num_alpha == 1  &&  num_digits == 5)  ||
               (num_alpha == 2  &&  num_digits == 6)  ||
               (num_alpha == 3  &&  num_digits == 5)  ||
               (num_alpha == 4  &&  num_digits == 8)  ||
               (num_alpha == 4  &&  num_digits == 9);
    }
    if ( num_undersc != 1 ) {
        return false;
    }

    if ( num_alpha == 3  &&  num_digits == 6  &&
         acc.compare(0, 4, "MAP_") == 0 ) {
        return true;
    }

    // RefSeq accession
    if ( num_alpha != 2 ) {
        return false;
    }
    if ( num_digits != 6  &&  num_digits != 8  &&  num_digits != 9 ) {
        return false;
    }

    // Note: the letters are taken from the very start of the string, even
    // when an "NZ_" prefix was skipped for counting.
    const char second_letter = acc[1];
    switch ( acc[0] ) {
    case 'N':
        switch ( second_letter ) {
        case 'C':  case 'G':  case 'M':  case 'R':
        case 'P':  case 'W':  case 'T':
            return true;
        default:
            return false;
        }

    case 'X':
        return second_letter == 'M'  ||  second_letter == 'R'  ||
               second_letter == 'P';

    case 'Z':  case 'A':  case 'Y':  case 'W':
        return second_letter == 'P';

    default:
        return false;
    }
}
1206
1207
// Checks the ".version" suffix of an accession: there must be a '.', and
// everything after the first '.' must be one or more decimal digits.
static bool s_IsValidDotVersion(const string& accn)
{
    const size_t dot = accn.find('.');
    if ( dot == string::npos ) {
        return false;
    }

    const size_t first_digit = dot + 1;
    if ( first_digit >= accn.size() ) {
        return false;   // nothing follows the dot
    }
    for ( size_t i = first_digit;  i < accn.size();  ++i ) {
        if ( !isdigit((unsigned char) accn[i]) ) {
            return false;
        }
    }
    return true;
}
1225
1226
IsValidAccession(const string & accn,EAccValFlag flag)1227 bool IsValidAccession(const string& accn, EAccValFlag flag)
1228 {
1229 // bool valid = s_IsValidAccession(accn);
1230 bool valid = (CSeq_id::IdentifyAccession(accn) != CSeq_id::eAcc_unknown);
1231 if (valid && flag == eValidateAccDotVer) {
1232 valid = s_IsValidDotVersion(accn);
1233 }
1234 return valid;
1235 }
1236
1237
DateToString(const CDate & date,string & str,EDateToString format_choice)1238 void DateToString(const CDate& date, string& str, EDateToString format_choice )
1239 {
1240 // One day we should make regular format default to JAN, since "JUN" seems
1241 // kind of arbitrary.
1242 static const char* regular_format = "%{%2D%|01%}-%{%3N%|JUN%}-%Y";
1243 static const char* cit_sub_format = "%{%2D%|??%}-%{%3N%|???%}-%{%4Y%|/???%}";
1244 static const char* patent_format = "%{%2D%|01%}-%{%3N%|JAN%}-%Y";
1245
1246 const char* format = ( format_choice == eDateToString_cit_sub ?
1247 cit_sub_format :
1248 ( format_choice == eDateToString_patent ? patent_format : regular_format ) );
1249
1250 string date_str;
1251 date.GetDate(&date_str, format);
1252 NStr::ToUpper(date_str);
1253 str.append(date_str);
1254 }
1255
1256
GetDeltaSeqSummary(const CBioseq_Handle & seq,SDeltaSeqSummary & summary)1257 void GetDeltaSeqSummary(const CBioseq_Handle& seq, SDeltaSeqSummary& summary)
1258 {
1259 if ( !seq.IsSetInst() ||
1260 !seq.IsSetInst_Repr() ||
1261 !(seq.GetInst_Repr() == CSeq_inst::eRepr_delta) ||
1262 !seq.IsSetInst_Ext() ||
1263 !seq.GetInst_Ext().IsDelta() ) {
1264 return;
1265 }
1266
1267 SDeltaSeqSummary temp;
1268 CScope& scope = seq.GetScope();
1269
1270 const CDelta_ext::Tdata& segs = seq.GetInst_Ext().GetDelta().Get();
1271 temp.num_segs = segs.size();
1272
1273 size_t len = 0;
1274
1275 CNcbiOstrstream text;
1276
1277 CDelta_ext::Tdata::const_iterator curr = segs.begin();
1278 CDelta_ext::Tdata::const_iterator end = segs.end();
1279 CDelta_ext::Tdata::const_iterator next;
1280 for ( ; curr != end; curr = next ) {
1281 {{
1282 // set next to one after curr
1283 next = curr; ++next;
1284 }}
1285 size_t from = len + 1;
1286 switch ( (*curr)->Which() ) {
1287 case CDelta_seq::e_Loc:
1288 {{
1289 const CDelta_seq::TLoc& loc = (*curr)->GetLoc();
1290 if ( loc.IsNull() ) { // gap
1291 ++temp.num_gaps;
1292 text << "* " << from << ' ' << len
1293 << " gap of unknown length~";
1294 } else { // count length
1295 size_t tlen = sequence::GetLength(loc, &scope);
1296 len += tlen;
1297 temp.residues += tlen;
1298 text << "* " << setw(8) << from << ' ' << setw(8) << len
1299 << ": contig of " << tlen << " bp in length~";
1300 }
1301 }}
1302 break;
1303 case CDelta_seq::e_Literal:
1304 {{
1305 const CDelta_seq::TLiteral& lit = (*curr)->GetLiteral();
1306 size_t lit_len = lit.CanGetLength() ? lit.GetLength() : 0;
1307 len += lit_len;
1308 if ( lit.CanGetSeq_data() && lit.GetSeq_data().Which() != CSeq_data::e_Gap ) {
1309 temp.residues += lit_len;
1310 while ( next != end && (*next)->IsLiteral() &&
1311 (*next)->GetLiteral().CanGetSeq_data() &&
1312 (*next)->GetLiteral().GetSeq_data().Which() != CSeq_data::e_Gap ) {
1313 const CDelta_seq::TLiteral& next_lit = (*next)->GetLiteral();
1314 size_t next_len = next_lit.CanGetLength() ?
1315 next_lit.GetLength() : 0;
1316 lit_len += next_len;
1317 len += next_len;
1318 temp.residues += next_len;
1319 ++next;
1320 }
1321 text << "* " << setw(8) << from << ' ' << setw(8) << len
1322 << ": contig of " << lit_len << " bp in length~";
1323 } else {
1324 bool unk = false;
1325 ++temp.num_gaps;
1326 if ( lit.CanGetFuzz() ) {
1327 const CSeq_literal::TFuzz& fuzz = lit.GetFuzz();
1328 if ( fuzz.IsLim() &&
1329 fuzz.GetLim() == CInt_fuzz::eLim_unk ) {
1330 unk = true;
1331 ++temp.num_faked_gaps;
1332 if ( from > len ) {
1333 text << "* gap of unknown length~";
1334 } else {
1335 text << "* " << setw(8) << from << ' ' << setw(8) << len
1336 << ": gap of unknown length~";
1337 }
1338 }
1339 }
1340 if ( !unk ) {
1341 text << "* " << setw(8) << from << " " << setw(8) << len
1342 << ": gap of " << lit_len << " bp~";
1343 }
1344 }
1345 }}
1346 break;
1347
1348 default:
1349 break;
1350 }
1351 }
1352 summary = temp;
1353 summary.text = CNcbiOstrstreamToString(text);
1354 }
1355
1356
1357 SAFE_CONST_STATIC_STRING(kTS_concept_trans, "conceptual translation");
1358 SAFE_CONST_STATIC_STRING(kTS_concept_trans_a, "conceptual translation supplied by author");
1359 SAFE_CONST_STATIC_STRING(kTS_both, "conceptual translation with partial peptide sequencing");
1360 SAFE_CONST_STATIC_STRING(kTS_seq_pept, "direct peptide sequencing");
1361 SAFE_CONST_STATIC_STRING(kTS_seq_pept_homol, "sequenced peptide, ordered by homology");
1362 SAFE_CONST_STATIC_STRING(kTS_seq_pept_overlap, "sequenced peptide, ordered by overlap");
1363
GetTechString(int tech)1364 const string& GetTechString(int tech)
1365 {
1366
1367 switch ( tech ) {
1368 case CMolInfo::eTech_concept_trans:
1369 return kTS_concept_trans.Get();
1370
1371 case CMolInfo::eTech_seq_pept :
1372 return kTS_seq_pept.Get();
1373
1374 case CMolInfo::eTech_both:
1375 return kTS_both.Get();
1376
1377 case CMolInfo::eTech_seq_pept_overlap:
1378 return kTS_seq_pept_overlap.Get();
1379
1380 case CMolInfo::eTech_seq_pept_homol:
1381 return kTS_seq_pept_homol.Get();
1382
1383 case CMolInfo::eTech_concept_trans_a:
1384 return kTS_concept_trans_a.Get();
1385
1386 default:
1387 return kEmptyStr;
1388 }
1389
1390 return kEmptyStr;
1391 }
1392
1393
s_IsModelEvidanceUop(const CUser_object & uo)1394 bool s_IsModelEvidanceUop(const CUser_object& uo)
1395 {
1396 return (uo.CanGetType() && uo.GetType().IsStr() &&
1397 uo.GetType().GetStr() == "ModelEvidence");
1398 }
1399
1400
// Depth-first search for a "ModelEvidence" user object.
//
// Returns uo itself if it is one; otherwise descends into every field that
// holds a user object (or a list of user objects) and returns the first hit.
// Returns a null pointer when nothing is found.
const CUser_object* s_FindModelEvidanceUop(const CUser_object& uo)
{
    if ( s_IsModelEvidanceUop(uo) ) {
        return &uo;
    }

    ITERATE (CUser_object::TData, ufi, uo.GetData()) {
        const CUser_field& uf = **ufi;
        if ( !uf.CanGetData() ) {
            continue;
        }
        const CUser_field::TData& data = uf.GetData();

        switch ( data.Which() ) {
        case CUser_field::TData::e_Object:
            {{
                const CUser_object* found =
                    s_FindModelEvidanceUop(data.GetObject());
                if ( found != NULL ) {
                    return found;
                }
            }}
            break;

        case CUser_field::TData::e_Objects:
            ITERATE (CUser_field::TData::TObjects, obj, data.GetObjects()) {
                const CUser_object* found = s_FindModelEvidanceUop(**obj);
                if ( found != NULL ) {
                    return found;
                }
            }
            break;

        default:
            break;
        }
    }

    return NULL;
}
1439
1440
s_GetModelEvidance(const CBioseq_Handle & bsh,SModelEvidance & me)1441 bool s_GetModelEvidance(const CBioseq_Handle& bsh, SModelEvidance& me)
1442 {
1443 CConstRef<CUser_object> moduop;
1444 bool result = false;
1445
1446 for (CSeqdesc_CI it(bsh, CSeqdesc::e_User); it; ++it) {
1447 moduop.Reset(s_FindModelEvidanceUop(it->GetUser()));
1448 if (moduop.NotEmpty()) {
1449 result = true;
1450 CConstRef<CUser_field> ufp;
1451 if( moduop->HasField("Contig Name") ) {
1452 ufp = &(moduop->GetField("Contig Name"));
1453 if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsStr() ) {
1454 me.name = ufp->GetData().GetStr();
1455 }
1456 }
1457 if( moduop->HasField("Assembly") ) {
1458 ufp = &(moduop->GetField("Assembly"));
1459 if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsFields() ) {
1460 ITERATE(CUser_field::C_Data::TFields, fld_itr, ufp->GetData().GetFields()) {
1461 const CUser_field& field = **fld_itr;
1462 ITERATE(CUser_field::C_Data::TFields, inr_itr, field.GetData().GetFields()) {
1463 const CUser_field& ufld = **inr_itr;
1464 if ( !ufld.IsSetLabel() || !ufld.GetLabel().IsStr() ) continue;
1465 const string& label = ufld.GetLabel().GetStr();
1466 if (label != "accession") continue;
1467 const CUser_field::C_Data& data = ufld.GetData();
1468 if (data.IsStr()) {
1469 const string& accn = data.GetStr();
1470 me.assembly.push_back(accn);
1471 }
1472 }
1473 }
1474 }
1475 }
1476 if ( moduop->HasField("Method") ) {
1477 ufp = &(moduop->GetField("Method"));
1478 if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsStr() ) {
1479 me.method = ufp->GetData().GetStr();
1480 }
1481 }
1482 if ( moduop->HasField("Counts") ) {
1483 ufp = &(moduop->GetField("Counts"));
1484 if ( ufp->HasField("mRNA")) {
1485 me.mrnaEv = true;
1486 }
1487 if ( ufp->HasField("EST")) {
1488 me.estEv = true;
1489 }
1490 }
1491 if ( moduop->HasField("mRNA") ) {
1492 me.mrnaEv = true;
1493 }
1494 if ( moduop->HasField("EST") ) {
1495 me.estEv = true;
1496 }
1497 if( moduop->HasField("Contig Gi") ) {
1498 ufp = &(moduop->GetField("Contig Gi"));
1499 if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsInt() ) {
1500 me.gi = GI_FROM(CUser_field::C_Data::TInt, ufp->GetData().GetInt());
1501 }
1502 }
1503 if( moduop->HasField("Contig Span") ) {
1504 ufp = &(moduop->GetField("Contig Span"));
1505 if ( ufp.NotEmpty() && ufp->IsSetData() && ufp->GetData().IsInts()
1506 && ufp->IsSetNum() && ufp->GetNum() == 2 && ufp->GetData().GetInts().size() == 2 )
1507 {
1508 const CUser_field::C_Data::TInts & int_list = ufp->GetData().GetInts();
1509 me.span.first = int_list[0];
1510 me.span.second = int_list[1];
1511 }
1512 }
1513 }
1514 }
1515
1516 // if me.name is missing version, try to update from me.gi
1517 if( me.gi > ZERO_GI && me.name.find('.') == string::npos ) {
1518 CSeq_id_Handle accver_idh = bsh.GetScope().GetAccVer( CSeq_id_Handle::GetGiHandle(me.gi) );
1519 if( accver_idh ) {
1520 CConstRef<CSeq_id> accver_seq_id = accver_idh.GetSeqIdOrNull();
1521 if( accver_seq_id ) {
1522 const CTextseq_id *text_id = accver_seq_id->GetTextseq_Id();
1523 if( text_id && text_id->IsSetAccession() && text_id->IsSetVersion() ) {
1524 me.name = text_id->GetAccession() + "." + NStr::IntToString(text_id->GetVersion());
1525 }
1526 }
1527 }
1528 }
1529
1530 return result;
1531 }
1532
1533
GetModelEvidance(const CBioseq_Handle & bsh,SModelEvidance & me)1534 bool GetModelEvidance(const CBioseq_Handle& bsh, SModelEvidance& me)
1535 {
1536 if ( s_GetModelEvidance(bsh, me) ) {
1537 return true;
1538 }
1539
1540 if ( CSeq_inst::IsAa(bsh.GetInst_Mol()) ) {
1541 CBioseq_Handle nuc = sequence::GetNucleotideParent(bsh);
1542 if ( nuc ) {
1543 return s_GetModelEvidance(nuc, me);
1544 }
1545 }
1546
1547 return false;
1548 }
1549
1550
1551 // in Ncbistdaa order
1552 static const char* kAANames[] = {
1553 "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
1554 "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val",
1555 "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM", "Pyl", "Xle"
1556 };
1557
1558
GetAAName(unsigned char aa,bool is_ascii)1559 const char* GetAAName(unsigned char aa, bool is_ascii)
1560 {
1561 if (is_ascii) {
1562 aa = (unsigned char)
1563 CSeqportUtil::GetMapToIndex(CSeq_data::e_Ncbieaa,
1564 CSeq_data::e_Ncbistdaa, aa);
1565 }
1566 return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
1567 }
1568
1569 //////////////////////////////////////////////////////////////////////////////
1570
GetResolveOrder(CScope & scope,const CSeq_id_Handle & mrna,const CSeq_id_Handle & prot,CBioseq_Handle & mrna_bsh,CBioseq_Handle & prot_bsh)1571 EResolveOrder GetResolveOrder(CScope& scope,
1572 const CSeq_id_Handle& mrna,
1573 const CSeq_id_Handle& prot,
1574 CBioseq_Handle& mrna_bsh,
1575 CBioseq_Handle& prot_bsh)
1576 {
1577 EResolveOrder order = eResolve_NotFound;
1578
1579 if (order == eResolve_NotFound) {
1580 CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
1581 local_scope->AddDefaults();
1582
1583 CBioseq_Handle possible_mrna = local_scope->GetBioseqHandle(mrna);
1584 CBioseq_Handle possible_prot;
1585 if (possible_mrna) {
1586 possible_prot =
1587 possible_mrna.GetTopLevelEntry().GetBioseqHandle(prot);
1588 }
1589 if (possible_mrna && possible_prot) {
1590 order = eResolve_RnaFirst;
1591 }
1592 }
1593
1594 if (order == eResolve_NotFound) {
1595 CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
1596 local_scope->AddDefaults();
1597
1598 CBioseq_Handle possible_prot = local_scope->GetBioseqHandle(prot);
1599 CBioseq_Handle possible_mrna;
1600 if (possible_prot) {
1601 possible_mrna =
1602 possible_prot.GetTopLevelEntry().GetBioseqHandle(mrna);
1603 }
1604
1605 if (possible_mrna && possible_prot) {
1606 order = eResolve_ProtFirst;
1607 }
1608 }
1609
1610 switch (order) {
1611 case eResolve_NotFound:
1612 mrna_bsh = CBioseq_Handle();
1613 prot_bsh = CBioseq_Handle();
1614 break;
1615
1616 case eResolve_RnaFirst:
1617 mrna_bsh = scope.GetBioseqHandle(mrna);
1618 prot_bsh = scope.GetBioseqHandle(prot);
1619 break;
1620
1621 case eResolve_ProtFirst:
1622 prot_bsh = scope.GetBioseqHandle(prot);
1623 mrna_bsh = scope.GetBioseqHandle(mrna);
1624 break;
1625 }
1626
1627 return order;
1628 }
1629
1630 //////////////////////////////////////////////////////////////////////////////
1631 // HTML utils and strings
1632
1633 // ============================================================================
1634 // Link locations:
1635 // ============================================================================
1636 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseNuc =
1637 "https://www.ncbi.nlm.nih.gov/nuccore/";
1638 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseProt =
1639 "https://www.ncbi.nlm.nih.gov/protein/";
1640
1641 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseEntrezViewer =
1642 "https://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val="; // https forwarded to http
1643
1644 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseTaxonomy =
1645 "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?";
1646 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseTransTable =
1647 "https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c#SG";
1648 NCBI_XOBJEDIT_EXPORT const char* strLinkBasePubmed =
1649 "https://www.ncbi.nlm.nih.gov/pubmed/";
1650 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseExpasy =
1651 "https://enzyme.expasy.org/EC/"; // not government site
1652 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseNucSearch =
1653 "https://www.ncbi.nlm.nih.gov/sites/entrez?db=Nucleotide&cmd=Search&term=";
1654 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseGenomePrj =
1655 "https://www.ncbi.nlm.nih.gov/bioproject/";
1656 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseLatLon =
1657 "https://www.ncbi.nlm.nih.gov/projects/Sequin/latlonview.html";
1658 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseGeneOntology =
1659 "http://amigo.geneontology.org/amigo/term/GO:"; // not government site
1660 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseGeneOntologyRef =
1661 "http://www.geneontology.org/cgi-bin/references.cgi#GO_REF:"; // not government site
1662 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseUSPTO =
1663 "https://patft.uspto.gov/netacgi/nph-Parser?patentnumber=";
1664 NCBI_XOBJEDIT_EXPORT const char* strLinkBaseUniProt =
1665 "https://www.uniprot.org/uniprot/";
1666
1667 NCBI_XOBJEDIT_EXPORT const char* strDocLink =
1668 "https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/";
1669
namespace {
    // Decides whether the '&' that str_iter points at still has to be
    // escaped.  The answer is "no" (false) when the '&' already starts one of
    // the entities "&gt;", "&lt;", "&amp;" or "&quot;", which makes sure
    // we're not "double-sanitizing" (e.g. "&gt;" to "&amp;gt;").
    //
    // The comparison is spelled out by hand instead of using a regex out of
    // concern that regexes would be too slow.
    // ============================================================================
    template<typename TIter>
    bool s_ShouldWeEscapeAmpersand(
        TIter str_iter, // yes, COPY not reference
        const TIter &str_iter_end)
    // ============================================================================
    {
        _ASSERT(*str_iter == '&');

        // what follows the '&' in each entity that must be left alone
        static const char* const kEntityTails[] = {
            "gt;", "lt;", "amp;", "quot;"
        };
        static const size_t kNumEntityTails =
            sizeof(kEntityTails) / sizeof(kEntityTails[0]);

        ++str_iter;   // step over the '&'
        for (size_t tail_idx = 0;  tail_idx < kNumEntityTails;  ++tail_idx) {
            const char* tail = kEntityTails[tail_idx];
            TIter cursor = str_iter;
            while (*tail != '\0'  &&  cursor != str_iter_end  &&
                   *cursor == *tail) {
                ++tail;
                ++cursor;
            }
            if (*tail == '\0') {
                // the complete entity is present
                return false;
            }
        }
        return true;
    }

    // see if the '<' opens an HTML tag (currently we
    // only check for a few kinds of tags )
    // ============================================================================
    template<typename TIter>
    bool s_IsTagStart(
        const TIter &str_iter,
        const TIter &str_iter_end)
    // ============================================================================
    {
        static const char* const kTagStarts[] = {
            "<a href=",
            "<acronym title",
            "</a>",
            "</acronym"
        };
        static const size_t kNumTagStarts =
            sizeof(kTagStarts) / sizeof(kTagStarts[0]);

        // we're at a tag if the text begins with any one of the candidates
        // (the comparison is case-sensitive)
        for (size_t start_idx = 0;  start_idx < kNumTagStarts;  ++start_idx) {
            const char* expected = kTagStarts[start_idx];
            TIter cursor = str_iter;
            while (*expected != '\0'  &&  cursor != str_iter_end  &&
                   *cursor == *expected) {
                ++expected;
                ++cursor;
            }
            if (*expected == '\0') {
                return true;
            }
        }
        return false;
    }

}
1773
// Replaces every double quote that is not inside an HTML tag by a single
// quote, in place.  "Inside a tag" is a heuristic: everything from a '<' up
// to the next '>'.  Returns true if at least one quote was replaced.
bool ConvertQuotesNotInHTMLTags(string &str)
{
    bool replaced_any = false;
    bool inside_tag   = false;

    for (string::iterator ch = str.begin();  ch != str.end();  ++ch) {
        if (*ch == '<') {
            // heuristic
            inside_tag = true;
        } else if (*ch == '>') {
            inside_tag = false;
        } else if (*ch == '"'  &&  !inside_tag) {
            *ch = '\'';
            replaced_any = true;
        }
    }

    return replaced_any;
}
1800
1801
1802 // ============================================================================
TryToSanitizeHtml(string & str)1803 void TryToSanitizeHtml(string &str)
1804 {
1805 string result;
1806 // The "* 1.1" should keep up efficient in most cases since data tends not to have
1807 // too many characters that need escaping.
1808 result.reserve(1 + (int)((double)str.length() * 1.1));
1809 TryToSanitizeHtml(result, str);
1810
1811 // swap is faster than assignment
1812 str.swap(result);
1813 }
1814
TryToSanitizeHtml(std::string & result,const CTempString & str)1815 void TryToSanitizeHtml(std::string &result, const CTempString& str)
1816 // ============================================================================
1817 {
1818 result.clear();
1819
1820 // we only sanitize when we're not in an url
1821 bool in_html_tag = false;
1822 ITERATE(CTempString, str_iter, str) {
1823 // see if we're entering an HTML tag
1824 if (!in_html_tag && *str_iter == '<' && s_IsTagStart(str_iter, str.end())) {
1825 in_html_tag = true;
1826 }
1827
1828 // now that we know whether we're in a tag,
1829 // process characters appropriately.
1830 if (in_html_tag) {
1831 switch (*str_iter) {
1832 case '&':
1833 // make sure we're not "double-sanitizing"
1834 // (e.g. ">" to "&gt;")
1835 if (s_ShouldWeEscapeAmpersand(str_iter, str.end())) {
1836 result += "&";
1837 }
1838 else {
1839 result += '&';
1840 }
1841 break;
1842 default:
1843 result += *str_iter;
1844 break;
1845 }
1846 }
1847 else {
1848 switch (*str_iter) {
1849 case '<':
1850 result += "<";
1851 break;
1852 case '>':
1853 result += ">";
1854 break;
1855 default:
1856 result += *str_iter;
1857 break;
1858 }
1859 }
1860
1861 // see if we're exiting an HTML tag
1862 if (in_html_tag && *str_iter == '>') {
1863 // tag is closed now
1864 // (Note: does this consider cases where '>' is in quotes?)
1865 in_html_tag = false;
1866 }
1867 }
1868 }
1869
1870 void
TryToSanitizeHtmlList(std::list<std::string> & strs)1871 TryToSanitizeHtmlList( std::list<std::string> &strs )
1872 {
1873 NON_CONST_ITERATE( std::list<std::string>, str_iter, strs ) {
1874 TryToSanitizeHtml( *str_iter );
1875 }
1876 }
1877
1878 bool
CommentHasSuspiciousHtml(const string & str)1879 CommentHasSuspiciousHtml( const string &str )
1880 {
1881 // list is not complete, still need to take proper precautions
1882 static const char* bad_html_strings[] = {
1883 "<script", "<object", "<applet", "<embed", "<form",
1884 "javascript:", "vbscript:"
1885 };
1886
1887 // load matching fsa if not already done
1888 static CSafeStatic<CTextFsa> fsa;
1889 if( ! fsa->IsPrimed() ) {
1890 for( size_t ii = 0; ii < ArraySize(bad_html_strings); ++ii ) {
1891 fsa->AddWord( bad_html_strings[ii] );
1892 }
1893 fsa->Prime();
1894 }
1895
1896 // do the match
1897 int current_state = 0;
1898 for ( SIZE_TYPE str_idx = 0 ; str_idx < str.length(); ++str_idx) {
1899 const char ch = str[str_idx];
1900 int next_state = fsa->GetNextState (current_state, ch);
1901 if (fsa->IsMatchFound (next_state)) {
1902 return true;
1903 }
1904 current_state = next_state;
1905 }
1906
1907 return false;
1908 }
1909
1910
1911 END_SCOPE(objects)
1912 END_NCBI_SCOPE
1913