1 /*  $Id: objutil.cpp 632875 2021-06-09 14:33:04Z ivanov $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * Author:  Mati Shomrat, NCBI
27 *
28 * File Description:
29 *   shared utility functions
30 *
31 */
32 #include <ncbi_pch.hpp>
33 #include <corelib/ncbistd.hpp>
34 
35 #include <util/strsearch.hpp>
36 
37 #include <objects/general/Date.hpp>
38 #include <objects/general/User_object.hpp>
39 #include <objects/general/User_field.hpp>
40 #include <objects/general/Object_id.hpp>
41 #include <objects/general/Date.hpp>
42 #include <objects/seq/Bioseq.hpp>
43 #include <objects/seq/Seq_inst.hpp>
44 #include <objects/seq/Seq_ext.hpp>
45 #include <objects/seq/Delta_ext.hpp>
46 #include <objects/seq/Delta_seq.hpp>
47 #include <objects/seq/Seq_literal.hpp>
48 #include <objects/seq/MolInfo.hpp>
49 #include <objects/seq/seqport_util.hpp>
50 #include <objects/seqloc/Seq_loc.hpp>
51 #include <objmgr/scope.hpp>
52 #include <objmgr/bioseq_handle.hpp>
53 #include <objmgr/seqdesc_ci.hpp>
54 #include <objmgr/object_manager.hpp>
55 #include <objmgr/util/sequence.hpp>
56 #include <objects/general/general_macros.hpp>
57 #include <algorithm>
58 #include <objmgr/util/objutil.hpp>
59 
60 
61 BEGIN_NCBI_SCOPE
62 BEGIN_SCOPE(objects)
63 
64 
65 SAFE_CONST_STATIC_STRING(kLegalPathChars, "0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz_-.");
66 
IsPartOfUrl(const string & sentence,size_t pos)67 bool IsPartOfUrl(const string& sentence, size_t pos)
68 {
69     string separators( "( \t\r\n" );
70     const string& legal_path_chars = kLegalPathChars.Get();
71 
72     //
73     //  Weed out silly input:
74     //
75     if ( sentence == "" || pos > sentence.length() - 1 ) {
76         return false;
77     }
78     if ( string::npos != separators.find( sentence[ pos ] ) ) {
79         return false;
80     }
81 
82     // Do easy tests first:
83 
84     //  We require the tilde to show up in a pattern like
85     //  "/~[0..9A..Za..z_-.]+". This is inherited from the C toolkit flat file
86     //  generator:
87     //
88     if ( (pos < 1) || (sentence[ pos-1 ] != '/') ) {
89         return false;
90     }
91 
92     //
93     //  Find the start of the "word" that surrounds the given position:
94     //
95     separators += '~';
96     string::size_type left_edge = sentence.find_last_of( separators, pos-1 );
97     if ( left_edge == string::npos ) {
98         left_edge = 0;
99     }
100     else {
101         ++left_edge;
102     }
103 
104     //
105     //  If it's a URL, it better start with a protocol specifier we approve of:
106     //
107     static const char* sc_ProtocolSpecifiers[] = {
108       "URL:",
109       "http:",
110       "https:",
111     };
112     DEFINE_STATIC_ARRAY_MAP_WITH_COPY(CStaticArraySet<string>, vProtocolSpecifiers, sc_ProtocolSpecifiers);
113     size_t colon = sentence.find( ':', left_edge );
114     if ( colon == string::npos ) {
115         return false;
116     }
117     string strMaybeUrl = sentence.substr( left_edge, colon - left_edge + 1 );
118     if ( vProtocolSpecifiers.find( strMaybeUrl ) == vProtocolSpecifiers.end() ) {
119         return false;
120     }
121 
122     ++pos;
123     if ( string::npos == legal_path_chars.find( sentence[ pos ] ) ) {
124         return false;
125     }
126 
127     for ( ++pos; sentence[ pos ] != 0; ++pos ) {
128         if ( string::npos == legal_path_chars.find( sentence[ pos ] ) ) {
129             return ( sentence[ pos ] == '/' );
130         }
131     }
132 
133     return false; /* never found the terminating '/' */
134 };
135 
136 
// Tell whether s[start, start + 66) consists solely of '*' characters.
//
// 'length' is the length of 's' as supplied by the caller.  The run must
// end strictly before 'length' (start + 66 < length), i.e. at least one
// more character has to follow the run; otherwise the result is false.
// The string is only inspected, never modified.
static bool s_RunOfStars(const std::string& s,
                         std::string::size_type start,
                         std::string::size_type length)
{
    // Number of consecutive asterisks that make up a "run".
    static const std::string::size_type kRunLength = 66;

    const std::string::size_type stop = start + kRunLength;
    if (stop >= length) {
        return false;
    }
    // No non-asterisk character may occur before 'stop'
    // (npos compares greater than any valid position).
    return s.find_first_not_of('*', start) >= stop;
}
150 
151 
ExpandTildes(string & s,ETildeStyle style)152 void ExpandTildes(string& s, ETildeStyle style)
153 {
154     if ( style == eTilde_tilde ) {
155         return;
156     }
157 
158     SIZE_TYPE start = 0, tilde, length = s.length();
159 
160     tilde = s.find('~', start);
161     if (tilde == NPOS) {  // no tilde
162         return;
163     }
164 
165     string result;
166 
167     while ( (start < length)  &&  (tilde = s.find('~', start)) != NPOS ) {
168         result.append(s, start, tilde - start);
169         char next = (tilde + 1) < length ? s[tilde + 1] : 0;
170         switch ( style ) {
171         case eTilde_space:
172             if ( (tilde + 1 < length  &&  isdigit((unsigned char) next) )  ||
173                  (tilde + 2 < length  &&  (next == ' '  ||  next == '(')  &&
174                   isdigit((unsigned char) s[tilde + 2]))) {
175                 result += '~';
176             } else {
177                 result += ' ';
178             }
179             start = tilde + 1;
180             break;
181 
182         case eTilde_newline:
183             if ( tilde + 1 < length  &&  s[tilde + 1] == '~' ) {
184                 result += '~';
185                 start = tilde + 2;
186             } else {
187                 result += "\n";
188                 start = tilde + 1;
189             }
190             break;
191 
192         case eTilde_note:
193             if ( tilde + 1 < length  &&  s[tilde + 1] == '~' ) {
194                 result += '~';
195                 start = tilde + 2;
196             } else {
197                 // plain "~" expands to ";\n", unless it's after a space or semi-colon, in
198                 // which case it becomes a plain "\n"
199                 char prevChar = ( tilde >= 1 ? s[tilde - 1] : '\0' );
200 
201                 if( ' ' == prevChar || ';' == prevChar ) {
202                     result += '\n';
203                 } else {
204                     result += ";\n";
205                 }
206                 start = tilde + 1;
207             }
208             break;
209 
210         case eTilde_comment:
211             if (tilde > 0  &&  s[tilde - 1] == '`') {
212                 result.replace(result.length() - 1, 1, 1,'~');
213             }
214             else if ( IsPartOfUrl( s, tilde ) ) {
215                 result += '~';
216             }
217             else {
218                 result += "\n";
219             }
220             start = tilde + 1;
221             if (s[start] == ' ' && s_RunOfStars(s, start+1, length)) {
222               start++;
223               result += '\n';
224             }
225             break;
226 
227         default: // just keep it, for lack of better ideas
228             result += '~';
229             start = tilde + 1;
230             break;
231         }
232     }
233     if (start < length) {
234         result.append(s, start, NPOS);
235     }
236     s.swap(result);
237 }
238 
239 
// Replace every double quote (") in 'str' with a single quote ('),
// modifying the string in place.
void ConvertQuotes(std::string& str)
{
    for (char& ch : str) {
        if (ch == '"') {
            ch = '\'';
        }
    }
}
244 
245 
// Return a copy of 'str' in which every double quote (") has been
// replaced with a single quote (').  The argument is left untouched.
std::string ConvertQuotes(const std::string& str)
{
    std::string retval(str);
    std::replace(retval.begin(), retval.end(), '"', '\'');
    return retval;
}
252 
// Normalize blanks in 'str' in place:
//   - every run of spaces and/or tabs collapses to a single space;
//   - that space is dropped altogether when it directly follows '(' or
//     directly precedes ')' or ','.
// Blanks at the very beginning or end of the string are collapsed like
// any other run but are not removed.
//
// Returns true if the LENGTH of the string changed.  Note that replacing
// a lone tab with a space alters the content but not the length, so the
// function returns false in that case.
bool StripSpaces(std::string& str)
{
    if (str.empty()) {
        return false;
    }
    const std::string::size_type orig_len = str.length();

    // Single in-place pass: 'in' reads, 'out' writes; out <= in throughout.
    std::string::size_type out = 0;
    std::string::size_type in  = 0;
    while (in < orig_len) {
        const char ch = str[in];
        if (ch != ' '  &&  ch != '\t') {
            str[out++] = ch;
            ++in;
            continue;
        }

        // swallow the whole run of blanks
        do {
            ++in;
        } while (in < orig_len  &&  (str[in] == ' '  ||  str[in] == '\t'));

        // emit one space for the run, unless it hugs a bracket or comma
        const bool after_open   = (out > 0  &&  str[out - 1] == '(');
        const bool before_close =
            (in < orig_len  &&  (str[in] == ')'  ||  str[in] == ','));
        if ( !after_open  &&  !before_close ) {
            str[out++] = ' ';
        }
    }
    str.resize(out);

    return (orig_len != str.length());
}
297 
298 
// Remove a single trailing period from 'str', if there is one.
//
// When 'keep_ellipsis' is true and the string ends in "...", nothing is
// removed.  Returns true if a period was chopped off, false otherwise.
//
// NB: variants that strip more trailing punctuation (".,;: " and the
// like) or re-normalize an ellipsis were tried in earlier revisions and
// are likely a better solution; however, the C toolkit differs, so only
// the simple rule above is applied.
bool RemovePeriodFromEnd(std::string& str, bool keep_ellipsis)
{
    const std::string::size_type len = str.length();

    // nothing to do unless the last character is a period
    if ( len == 0  ||  str[len - 1] != '.' ) {
        return false;
    }

    // leave a trailing ellipsis alone when asked to
    if ( keep_ellipsis  &&  len >= 3  &&  str.compare(len - 3, 3, "...") == 0 ) {
        return false;
    }

    str.erase(len - 1);
    return true;
}
375 
376 
// Make 'str' end in exactly one period: any trailing run of spaces,
// tabs, tildes, periods and newlines is removed first, then a single
// '.' is appended.  A string made up only of such characters (or an
// empty one) becomes ".".
void AddPeriod(std::string& str)
{
    const std::string::size_type last_kept = str.find_last_not_of(" \t~.\n");
    if (last_kept == std::string::npos) {
        // nothing worth keeping
        str.clear();
    } else {
        str.resize(last_kept + 1);
    }
    str.push_back('.');
}
383 
384 
// Remove trailing whitespace (as classified by isspace) from 'str', but
// never shorten the string below 'indent' characters: the first 'indent'
// characters are kept even if they are whitespace.  Strings that are not
// longer than 'indent' are left untouched.
void TrimSpaces(std::string& str, size_t indent)
{
    if (str.length() <= indent) {
        return;
    }

    // 'end' is one past the last character to keep.  Counting this way
    // (instead of tracking the index of the last kept character) avoids
    // unsigned wrap-around when indent == 0 and the whole string is
    // whitespace.
    std::string::size_type end = str.length();
    while (end > indent  &&
           std::isspace(static_cast<unsigned char>(str[end - 1]))) {
        --end;
    }
    str.erase(end);
}
401 
// Predicate wrapping isgraph() for use with STL algorithms such as
// find_if; needed because not all compilers will just let you pass
// "isgraph" itself.  The character is converted to unsigned char before
// the call, as isgraph() requires.
class CIsGraph
{
public:
    // const so that the predicate can also be invoked through a const
    // object or reference
    bool operator()( const char c ) const {
        return std::isgraph(static_cast<unsigned char>(c)) != 0;
    }
};
410 
// Compress whitespace in 'str' in place.
//
// Every character for which isgraph() is false (spaces, control
// characters, ...) counts as "blank"; each run of blanks is replaced by
// a single space.
// If trim_beginning, all blanks at the beginning of the string are
// removed; otherwise a leading run is compressed to one space.
// If trim_end, all blanks at the end of the string are removed;
// otherwise a trailing run is compressed to one space.
// Returns the string you gave it.
std::string& CompressSpaces( std::string& str, const bool trim_beginning, const bool trim_end )
{
    if( str.empty() ) {
        return str;
    }

    // One predicate for both trimming and compressing.  The cast to
    // unsigned char is required: passing a negative char to isgraph()
    // is undefined behavior.
    const auto is_graph = [](char c) {
        return std::isgraph(static_cast<unsigned char>(c)) != 0;
    };

    // Determine the range [start_iter, end_iter) we are going to keep.
    const std::string::iterator start_iter =
        trim_beginning ? std::find_if( str.begin(), str.end(), is_graph )
                       : str.begin();
    if( start_iter == str.end() ) {
        // nothing visible in the whole string
        str.clear();
        return str;
    }

    const std::string::iterator end_iter =
        trim_end ? std::find_if( str.rbegin(), str.rend(), is_graph ).base()
                 : str.end();
    if( end_iter == str.begin() ) {
        str.clear();
        return str;
    }

    // The main part, where we compress spaces
    std::string newstr; // result will end up here
    newstr.reserve( end_iter - start_iter );

    // Starts out 'true' so that a leading blank run (when not trimmed)
    // still yields one space.
    bool last_ch_was_printable = true;
    for( std::string::iterator iter = start_iter;  iter < end_iter;  ++iter ) {
        const char ch = *iter;
        if( is_graph(ch) ) {
            // visible characters get copied straight
            newstr += ch;
            last_ch_was_printable = true;
        } else {
            // blanks become a space, and only the first one of a run is kept
            if( last_ch_was_printable ) {
                newstr += ' ';
            }
            last_ch_was_printable = false;
        }
    }

    str.swap( newstr );
    return str;
}
472 
473 
// Trim "junk" from the end of 'str' and blanks from its beginning, in
// place.  Returns true if it changed the string.
//
// Junk characters are '.', ',', '~', ';' and everything that compares
// <= ' ' (space and control characters).  After the trailing junk run
// has been located:
//   - if the junk starts with a ';' that closes an HTML escape such as
//     "&bgr;" (an '&', then only alphanumerics or '#', within 19
//     characters before the ';'), that ';' is kept;
//   - if the junk contains a period, the junk is replaced by "." -- or by
//     "..." when allow_ellipsis is set, the junk is at least three
//     characters long and its 2nd and 3rd characters are periods;
//   - otherwise, if the junk begins with '~', it is replaced by "~" (or
//     "~~" when a second tilde follows immediately);
//   - otherwise the junk is simply removed.
// Finally, leading characters <= ' ' are removed.
//
// This is based on the C function TrimSpacesAndJunkFromEnds and should
// have the same output, except:
// - We do NOT chop off a semicolon if we determine that it's
//   part of an HTML escape char (e.g. "&bgr;" ).
// - There are some changes in how tildes are handled;
//   this algo is less likely to remove them.
//
// TODO: a cleaner ellipsis-trimming algorithm (trim leading blanks, trim
// trailing junk, then re-append "..." or "." depending on what was
// chopped off) has been sketched for this function, but for
// compatibility with C the algorithm below is used.  We can switch over
// later.
//
// NOTE(review): the "<= ' '" tests are done on plain char, so where char
// is signed, bytes >= 0x80 are treated as junk/blanks as well -- confirm
// whether that is intended.
bool TrimSpacesAndJunkFromEnds(std::string& str, bool allow_ellipsis)
{
    if ( str.empty() ) {
        return false;
    }

    // Make start_of_junk_pos hold the beginning of the "junk" at the end
    // (0 if the whole string is junk).  While we're at it, also note
    // whether the junk contains a period and/or a tilde.
    bool isPeriod = false;
    bool isTilde  = false;
    std::string::size_type start_of_junk_pos = str.length();
    while ( start_of_junk_pos > 0 ) {
        const char ch = str[start_of_junk_pos - 1];
        if ( ch <= ' '  ||  ch == '.'  ||  ch == ','  ||  ch == '~'  ||  ch == ';' ) {
            // found junk character
            isPeriod = (isPeriod  ||  ch == '.');
            isTilde  = (isTilde   ||  ch == '~');
            --start_of_junk_pos;
        } else {
            // found non-junk character; the junk starts right after it
            break;
        }
    }

    // Check for ';' that's part of an HTML escape char like "&bgr;" and
    // skip over it (i.e., don't remove it) if so.
    if ( start_of_junk_pos < str.length()  &&  str[start_of_junk_pos] == ';' ) {
        // we assume no HTML escape char will be longer than this
        static const std::string::size_type kMaxCharsToLookAt = 20;

        // go backwards, looking for the ampersand
        for ( std::string::size_type back = 1;
              back <= start_of_junk_pos  &&  back < kMaxCharsToLookAt;
              ++back ) {
            // unsigned char: isalnum() must not be given a negative value
            const unsigned char ch = str[start_of_junk_pos - back];
            if ( std::isalnum(ch)  ||  ch == '#' ) {
                continue;  // could still be an escape, keep looking
            }
            if ( ch == '&' ) {
                // The semicolon ends an HTML escape character, so we skip it
                ++start_of_junk_pos;
            }
            // otherwise the semicolon does NOT end an HTML escape
            // character, so it stays part of the junk
            break;
        }
    }

    bool changed = false;

    // If there's junk, chop it off (but leave period/tildes/ellipsis as
    // appropriate).
    if ( start_of_junk_pos < str.length() ) {

        // holds the suffix to add after we remove the junk
        const char* suffix = ""; // by default, just remove junk

        // always >= 1 here, by the enclosing condition
        const std::string::size_type chars_in_junk =
            str.length() - start_of_junk_pos;

        if ( isPeriod ) {
            // allow one period at end; check if we should put an ellipsis,
            // or just a period
            const bool putEllipsis =
                ( allow_ellipsis  &&  chars_in_junk >= 3  &&
                  str[start_of_junk_pos + 1] == '.'  &&
                  str[start_of_junk_pos + 2] == '.' );
            suffix = ( putEllipsis ? "..." : "." );
        } else if ( isTilde ) {
            // allow tilde(s)
            // (This should work on single- AND double-tildes because
            // we don't know whether or not tilde-expansion was called
            // before this point.)
            if ( str[start_of_junk_pos] == '~' ) {
                const bool doubleTilde =
                    ( chars_in_junk >= 2  &&  str[start_of_junk_pos + 1] == '~' );
                suffix = ( doubleTilde ? "~~" : "~" );
            }
        }

        if ( suffix[0] != '\0' ) {
            // only touch the string if the junk is not already the suffix
            if ( 0 != str.compare(start_of_junk_pos, std::string::npos, suffix) ) {
                str.erase(start_of_junk_pos);
                str += suffix;
                changed = true;
            }
        } else {
            str.erase(start_of_junk_pos);
            changed = true;
        }
    }

    // Remove the initial whitespace.
    std::string::iterator input_iter = str.begin();
    while ( input_iter != str.end()  &&  *input_iter <= ' ' ) {
        ++input_iter;
    }
    if ( input_iter != str.begin() ) {
        str.erase(str.begin(), input_iter);
        changed = true;
    }

    return changed;
}
645 
646 // this is copy-pasted method and optimized to use CTempString
TrimSpacesAndJunkFromEnds(string & result,const CTempString & str,bool allow_ellipsis)647 void TrimSpacesAndJunkFromEnds(string& result, const CTempString& str, bool allow_ellipsis)
648 {
649     // TODO: This commented out code represents how ellipsis trimming
650     // should work.  However, for compatibility with C, we're using a
651     // (in my opinion) suboptimal algorithm.  We can switch over later.
652 
653     //if (str.empty()) {
654     //    return;
655     //}
656 
657     //size_t strlen = str.length();
658     //size_t begin = 0;
659 
660     //// trim unprintable characters (and space) off the beginning
661     //while (begin != strlen) {
662     //    unsigned char ch = str[begin];
663     //    if (ch > ' ') {
664     //        break;
665     //    } else {
666     //        ++begin;
667     //    }
668     //}
669 
670     //// we're done if we trimmed the string to nothing
671     //if (begin == strlen) {
672     //    str.erase();
673     //    return;
674     //}
675 
676     //// trim junk off the end (while we're at it, record whether we're chopping off a period)
677     //size_t end = strlen - 1;
678     //bool has_period = false;
679     //while (end > begin) {
680     //    unsigned char ch = str[end];
681     //    if (ch <= ' '  ||  ch == '.'  ||  ch ==  ','  ||  ch == '~'  ||  ch == ';') {
682     //        has_period = (has_period  ||  ch == '.');
683     //        --end;
684     //    } else {
685     //        break;
686     //    }
687     //}
688 
689     //// check whether we're about to chop off an ellipsis, so we remember to add it back
690     //// TODO: There's got to be a more efficient way of doing this
691     //const bool weChoppedOffAnEllipsis = ( NPOS != NStr::Find(str, "...", end) );
692 
693     //// do the actual chopping here
694     //str = str.substr( begin, end + 1 );
695 
696     //// restore chopped off ellipsis or period, if any
697     //if ( allow_ellipsis && weChoppedOffAnEllipsis ) {
698     //    str += "...";
699     //} else if (has_period) {
700     //    // re-add any periods if we had one before
701     //    str += '.';
702     //}
703 
704     // This is based on the C function TrimSpacesAndJunkFromEnds.
705     // Although it's updated to use iterators and such and to
706     // return whether it changed the string, it should
707     // have the same output, except:
708     // - We do NOT chop off a semicolon if we determine that it's
709     //   part of an HTML escape char (e.g. "&bgr;" ).
710     // - There are some changes in how tildes are handled;
711     //   this algo is less likely to remove them.
712 
713     if (str.empty()) {
714         result.clear();
715         return;
716     }
717 
718     // make start_of_junk_pos hold the beginning of the "junk" at the end
719     // (where junk is defined as one of several characters)
720     // while we're at it, also check if the junk contains a tilde and/or period
721     bool isPeriod = false;
722     bool isTilde = false;
723     size_t start_of_junk_pos = 0;
724     for (size_t len = str.length(); len && start_of_junk_pos == 0; len--)
725     {
726         char ch = str[len-1];
727         if (ch <= ' ') ch = ' ';
728         switch (ch)
729         {
730           case '.':
731               isPeriod = true;
732               break;
733           case '~':
734               isTilde = true;
735               break;
736           case ';':
737           case ',':
738           case ' ':
739               break;
740           default:
741               // found non-junk character.  Last junk character is just after this
742               start_of_junk_pos = len;
743               break;
744         }
745     }
746 
747     // check for ';' that's part of an HTML escape char like "&bgr;" and
748     // skip over it (i.e., don't remove it) if so
749     if (start_of_junk_pos < str.length() && str[start_of_junk_pos] == ';') {
750         // we assume no HTML escape char will be longer than this
751         static const int kMaxCharsToLookAt = 20;
752 
753         // go backwards, looking for the ampersand
754         int amp_iter = ((int)start_of_junk_pos - 1);
755         for (; amp_iter >= 0 && ((start_of_junk_pos - amp_iter) < kMaxCharsToLookAt); --amp_iter) {
756             const unsigned char ch = str[amp_iter];
757             if (isalnum(ch) || ch == '#') {
758                 // just keep going
759             }
760             else if (ch == '&') {
761                 // The semicolon ends an HTML escape character, so we skip it
762                 ++start_of_junk_pos;
763                 break;
764             }
765             else {
766                 // The semicolon does NOT end an HTML escape character, so we might remove it
767                 break;
768             }
769         }
770     }
771 
772     // holds the suffix to add after we remove the junk
773     CTempString suffix; // by default, just remove junk
774 
775     // if there's junk, chop it off (but leave period/tildes/ellipsis as appropriate)
776     if (start_of_junk_pos < str.length()) {
777 
778         const int chars_in_junk = (int)(str.length() - start_of_junk_pos);
779         _ASSERT(chars_in_junk >= 1);
780 
781         // allow one period at end
782         if (isPeriod) {
783             // check if we should put an ellipsis, or just a period
784             const bool putEllipsis = (allow_ellipsis && (chars_in_junk >= 3) &&
785                 str[start_of_junk_pos + 1] == '.' && str[start_of_junk_pos + 2] == '.');
786 
787             suffix = (putEllipsis ? "..." : ".");
788         }
789         else if (isTilde) {
790             // allow tilde(s)
791             // (This should work on single- AND double-tildes because
792             // we don't know whether or not tilde-expansion was called before this
793             // point )
794             if (str[start_of_junk_pos] == '~') {
795                 const bool doubleTilde = ((chars_in_junk >= 2) && str[start_of_junk_pos + 1] == '~');
796                 suffix = (doubleTilde ? "~~" : "~");
797             }
798         }
799     }
800     const char* ptr = str.data();
801     size_t len = start_of_junk_pos;
802     while (len && *ptr <= ' ')
803     {
804         len--; ptr++;
805     }
806     result.reserve(len + suffix.length());
807     result.assign(ptr, len);
808     result.append(suffix.data(), suffix.length());
809 }
810 
811 // two-bytes combinations we're looking to clean
812 #define twochars(a,b) Uint2((a) << 8 | (b))
813 #define twocommas twochars(',',',')
814 #define twospaces twochars(' ',' ')
815 #define twosemicolons twochars(';',';')
816 #define space_comma twochars(' ',',')
817 #define space_bracket twochars(' ',')')
818 #define bracket_space twochars('(',' ')
819 #define space_semicolon twochars(' ',';')
820 #define comma_space twochars(',',' ')
821 #define semicolon_space twochars(';',' ')
822 
CleanAndCompress(string & dest,const CTempString & instr)823 void CleanAndCompress(string& dest, const CTempString& instr)
824 {
825     size_t left = instr.size();
826     // this is the input stream
827     const char* in = instr.data();
828 
829     // skip front white spaces
830     while (left && *in == ' ')
831     {
832         in++;
833         left--;
834     }
835     // forget end white spaces
836     while (left && in[left - 1] == ' ')
837     {
838         left--;
839     }
840 
841     dest.resize(left);
842 
843     if (left < 1) return;
844 
845     // this is where we write result
846     char* out = (char*)dest.c_str();
847 
848     char curr = *in++; // initialize with first character
849     left--;
850 
851     char next = 0;
852     Uint2 two_chars = curr; // this is two bytes storage where we see current and previous symbols
853 
854     while (left > 0) {
855         next = *in++;
856 
857         two_chars = Uint2((two_chars << 8) | next);
858 
859         switch (two_chars)
860         {
861         case twocommas: // replace double commas with comma+space
862             *out++ = curr;
863             next = ' ';
864             break;
865         case twospaces: // skip multiple spaces (only print last one)
866             break;
867         case twosemicolons: // skip multiple semicolons (only print last one)
868             break;
869         case bracket_space: // skip space after bracket
870             next = curr;
871             two_chars = curr;
872             break;
873         case space_bracket: // skip space before bracket
874             break;
875         case space_comma:
876             *out++ = next;
877             next = curr;
878             *out++ = ' ';
879             while ((next == ' ' || next == ',') && left > 0) {
880                 next = *in;
881                 in++;
882                 left--;
883             }
884             two_chars = next;
885             break;
886         case space_semicolon:
887             *out++ = next;
888             next = curr;
889             *out++ = ' ';
890             while ((next == ' ' || next == ';') && left > 0) {
891                 next = *in;
892                 in++;
893                 left--;
894             }
895             two_chars = next;
896             break;
897         case comma_space:
898             *out++ = curr;
899             *out++ = ' ';
900             while ((next == ' ' || next == ',') && left > 0) {
901                 next = *in;
902                 in++;
903                 left--;
904             }
905             two_chars = next;
906             break;
907         case semicolon_space:
908             *out++ = curr;
909             *out++ = ' ';
910             while ((next == ' ' || next == ';') && left > 0) {
911                 next = *in;
912                 in++;
913                 left--;
914             }
915             two_chars = next;
916             break;
917         default:
918             *out++ = curr;
919             break;
920         }
921 
922         curr = next;
923         if (left > 0) {
924             left--;
925         }
926     }
927 
928     if (curr > 0 && curr != ' ') {
929         *out++ = curr;
930     }
931 
932     dest.resize(out - dest.c_str());
933 }
934 
#if 0
// Disabled ad-hoc driver for CleanAndCompress(): when enabled, the global
// object below runs every sample through the function at static
// initialization time and prints "input--->output." for visual inspection.
struct CleanAndCompress_unit_test
{
    CleanAndCompress_unit_test()
    {
        test("C( )C");
        test("xx,,xx");
        test("xx,, xx");
        test("xx,,  xx");
        test("  xx  xx  ");
        test("xx , xx");
        test("xx  , xx");
        test("xx(xx)");
        test("xx( xx )");
    }
    // Takes a pointer to const: every caller passes a string literal, which
    // cannot be bound to a plain char* since C++11.
    void test(const char* s)
    {
        string str;
        CleanAndCompress(str, s);
        cout << s << "--->" << str << '.' << endl;
    }
};

CleanAndCompress_unit_test t;
#endif
960 
961 
962 /*
963 void CleanAndCompress (string& str)
964 {
965     if (str.empty()) {
966         return;
967     }
968 
969     size_t pos = str.find (" ,");
970     if (pos != NPOS) {
971         str [pos] = ',';
972         str [pos+1] = ' ';
973     }
974     pos = str.find (",,");
975     if (pos != NPOS) {
976         str [pos+1] = ' ';
977     }
978     pos = str.find (" ;");
979     if (pos != NPOS) {
980         str [pos] = ';';
981         str [pos+1] = ' ';
982     }
983     pos = str.find ("( ");
984     if (pos != NPOS) {
985         str [pos] = ' ';
986         str [pos+1] = '(';
987     }
988     pos = str.find (" )");
989     if (pos != NPOS) {
990         str [pos] = ')';
991         str [pos+1] = ' ';
992     }
993 
994     string::iterator end = str.end();
995     string::iterator it = str.begin();
996     string::iterator new_str = it;
997     while (it != end) {
998         *new_str++ = *it;
999         if ( (*it == ' ')  ||  (*it == '\t')  ||  (*it == '(') ) {
1000             for (++it; (it != end) && (*it == ' ' || *it == '\t'); ++it) continue;
1001             if ((it != end) && (*it == ')' || *it == ',') ) {
1002                 // this "if" protects against the case "(...bunch of spaces and tabs...)".
1003                 // Otherwise, the first '(' is unintentionally erased
1004                 if( *(new_str - 1) != '(' ) {
1005                     --new_str;
1006                 }
1007             }
1008         } else {
1009             ++it;
1010         }
1011     }
1012     str.erase(new_str, str.end());
1013 }
1014 */
1015 
1016 
#if 0
// Disabled ad-hoc driver: when enabled, the static object below feeds a few
// sample strings to both overloads of TrimSpacesAndJunkFromEnds() (the
// in-place one and the one taking a separate input) at static
// initialization time.  Nothing is checked or printed; it is meant to be
// stepped through in a debugger.
struct CJunkUnitTest
{
    // v   - sample input
    // a_e - flag forwarded unchanged to TrimSpacesAndJunkFromEnds()
    void test(CTempString v, bool a_e)
    {
        string res(v);
        TrimSpacesAndJunkFromEnds(res, a_e);
        TrimSpacesAndJunkFromEnds(res, v, a_e);
    }
    CJunkUnitTest()
    {
        test(" .", true);
        test(" aaa bbb.....", true);
        test(" aaa bbb.....", false);
        test(" aaa bbb~~~~~", true);
        test(" aaa bbb,,,,,", true);
        test(" aaa bbb;;;;;;", true);
    }
};

static CJunkUnitTest c;
#endif
1039 
// Tells whether position 'pos' of 'str' is the start of a word, i.e. whether
// the character right before it is white space or punctuation.  Position 0
// and positions beyond the end of the string are reported as word starts.
//
// NB: only the left-hand side is examined.  That mirrors a long-standing
// C toolkit bug which was never fixed and is by now the expected behavior.
static bool s_IsWholeWord(const string& str, size_t pos)
{
    if (pos == 0  ||  pos > str.size()) {
        return true;
    }

    const unsigned char before = (unsigned char) str[pos - 1];
    if (isspace(before)) {
        return true;
    }
    return ispunct(before) != 0;
}
1048 
1049 
JoinString(string & to,const string & prefix,const string & str,bool noRedundancy)1050 void JoinString(string& to, const string& prefix, const string& str, bool noRedundancy)
1051 {
1052     if ( str.empty() ) {
1053         return;
1054     }
1055 
1056     if ( to.empty() ) {
1057         to += str;
1058         return;
1059     }
1060 
1061     size_t pos = NPOS;
1062     if (noRedundancy) {
1063         //for ( pos = NStr::Find(to, str); pos != NPOS; pos += str.length()) {
1064         for ( pos = NStr::Find(to, str);
1065               pos != NPOS;  pos = NStr::Find(to, str, pos + 1)) {
1066             if (s_IsWholeWord(to, pos)) {
1067                 return;
1068             }
1069         }
1070     }
1071 
1072     //LOG_POST(Error << "adding: to=" << to << "  prefix=" << prefix << "  str=" << str);
1073 
1074     if( NStr::StartsWith(prefix, ";") && NStr::EndsWith(to, ";") ) {
1075         to += prefix.substr(1);
1076     } else {
1077         to += prefix;
1078     }
1079     to += str;
1080 }
1081 
1082 
JoinString(const list<string> & l,const string & delim,bool noRedundancy)1083 string JoinString(const list<string>& l, const string& delim, bool noRedundancy)
1084 {
1085     if ( l.empty() ) {
1086         return kEmptyStr;
1087     }
1088 
1089     /**
1090     string result;
1091     set<CTempString> strings;
1092     ITERATE (list<string>, it, l) {
1093         if ( !noRedundancy  ||
1094              strings.insert(CTempString(*it)).second) {
1095             if ( !result.empty() ) {
1096                 result += delim;
1097             }
1098             result += *it;
1099         }
1100     }
1101     **/
1102 
1103     string result = l.front();
1104     list<string>::const_iterator it = l.begin();
1105     while ( ++it != l.end() ) {
1106         JoinString(result, delim, *it, noRedundancy);
1107     }
1108 
1109     return result;
1110 }
1111 
1112 
// Validate the correct format of an accession string.
//
// The string must be non-empty, shorter than 16 characters and start with an
// uppercase letter.  After an optional "NZ_" prefix it is parsed as
//     <letters> <underscores> <digits>
// and must then end, or continue with a blank or a '.'.
//
// Accepted shapes:
//   no underscore  : 1 letter  + 5 digits, 2 letters + 6 digits,
//                    3 letters + 5 digits, 4 letters + 8 or 9 digits
//   one underscore : "MAP_" + 6 digits, or a RefSeq accession made of
//                    2 letters + 6, 8 or 9 digits whose two leading
//                    characters are one of
//                    NC NG NM NR NP NW NT / XM XR XP / ZP AP YP / WP
// Everything else is rejected.
static bool s_IsValidAccession(const string& acc)
{
    static const size_t kMaxAccLength = 16;

    if ( acc.empty()  ||  acc.length() >= kMaxAccLength ) {
        return false;
    }

    // first character must be uppercase letter
    const unsigned char lead = (unsigned char) acc[0];
    if ( !isalpha(lead)  ||  !isupper(lead) ) {
        return false;
    }

    // Scanning starts after the optional "NZ_" prefix.
    const char* ptr = acc.c_str();
    if ( acc.compare(0, 3, "NZ_") == 0 ) {
        ptr += 3;
    }

    size_t num_alpha = 0;
    while ( isalpha((unsigned char)(*ptr)) ) {
        ++ptr;
        ++num_alpha;
    }
    size_t num_undersc = 0;
    while ( *ptr == '_' ) {
        ++ptr;
        ++num_undersc;
    }
    size_t num_digits = 0;
    while ( isdigit((unsigned char)(*ptr)) ) {
        ++ptr;
        ++num_digits;
    }

    // Only end of string, a blank or a version dot may follow the digits.
    const bool proper_end = (*ptr == '\0')  ||  (*ptr == ' ')  ||  (*ptr == '.');
    if ( !proper_end ) {
        return false;
    }

    if ( num_undersc == 0 ) {
        return (num_alpha == 1  &&  num_digits == 5)  ||
               (num_alpha == 2  &&  num_digits == 6)  ||
               (num_alpha == 3  &&  num_digits == 5)  ||
               (num_alpha == 4  &&  (num_digits == 8  ||  num_digits == 9));
    }
    if ( num_undersc != 1 ) {
        return false;
    }

    if ( num_alpha == 3  &&  num_digits == 6  &&
         acc.compare(0, 4, "MAP_") == 0 ) {
        return true;
    }

    // RefSeq accession
    if ( num_alpha != 2 ) {
        return false;
    }
    if ( num_digits != 6  &&  num_digits != 8  &&  num_digits != 9 ) {
        return false;
    }

    const char second_letter = acc[1];
    switch ( acc[0] ) {
    case 'N':
        switch ( second_letter ) {
        case 'C':  case 'G':  case 'M':  case 'R':
        case 'P':  case 'W':  case 'T':
            return true;
        default:
            return false;
        }

    case 'X':
        return second_letter == 'M'  ||  second_letter == 'R'  ||
               second_letter == 'P';

    case 'Z':
    case 'A':
    case 'Y':
    case 'W':
        return second_letter == 'P';

    default:
        return false;
    }
}
1206 
1207 
// Checks that 'accn' carries a ".<version>" suffix: everything after the
// first '.' must be decimal digits, and there must be at least one of them.
static bool s_IsValidDotVersion(const string& accn)
{
    const size_t dot = accn.find('.');
    if (dot == string::npos) {
        return false;
    }

    const size_t first_digit = dot + 1;
    if (first_digit >= accn.size()) {
        // nothing follows the dot
        return false;
    }

    for (size_t i = first_digit;  i < accn.size();  ++i) {
        if ( !isdigit((unsigned char) accn[i]) ) {
            return false;
        }
    }
    return true;
}
1225 
1226 
IsValidAccession(const string & accn,EAccValFlag flag)1227 bool IsValidAccession(const string& accn, EAccValFlag flag)
1228 {
1229     // bool valid = s_IsValidAccession(accn);
1230     bool valid = (CSeq_id::IdentifyAccession(accn) != CSeq_id::eAcc_unknown);
1231     if (valid  &&  flag == eValidateAccDotVer) {
1232         valid = s_IsValidDotVersion(accn);
1233     }
1234     return valid;
1235 }
1236 
1237 
DateToString(const CDate & date,string & str,EDateToString format_choice)1238 void DateToString(const CDate& date, string& str, EDateToString format_choice )
1239 {
1240     // One day we should make regular format default to JAN, since "JUN" seems
1241     // kind of arbitrary.
1242     static const char* regular_format = "%{%2D%|01%}-%{%3N%|JUN%}-%Y";
1243     static const char* cit_sub_format = "%{%2D%|??%}-%{%3N%|???%}-%{%4Y%|/???%}";
1244     static const char* patent_format  = "%{%2D%|01%}-%{%3N%|JAN%}-%Y";
1245 
1246     const char* format = ( format_choice == eDateToString_cit_sub ?
1247         cit_sub_format :
1248         ( format_choice == eDateToString_patent ? patent_format : regular_format ) );
1249 
1250     string date_str;
1251     date.GetDate(&date_str, format);
1252     NStr::ToUpper(date_str);
1253     str.append(date_str);
1254 }
1255 
1256 
GetDeltaSeqSummary(const CBioseq_Handle & seq,SDeltaSeqSummary & summary)1257 void GetDeltaSeqSummary(const CBioseq_Handle& seq, SDeltaSeqSummary& summary)
1258 {
1259     if ( !seq.IsSetInst()                                ||
1260          !seq.IsSetInst_Repr()                           ||
1261          !(seq.GetInst_Repr() == CSeq_inst::eRepr_delta) ||
1262          !seq.IsSetInst_Ext()                            ||
1263          !seq.GetInst_Ext().IsDelta() ) {
1264         return;
1265     }
1266 
1267     SDeltaSeqSummary temp;
1268     CScope& scope = seq.GetScope();
1269 
1270     const CDelta_ext::Tdata& segs = seq.GetInst_Ext().GetDelta().Get();
1271     temp.num_segs = segs.size();
1272 
1273     size_t len = 0;
1274 
1275     CNcbiOstrstream text;
1276 
1277     CDelta_ext::Tdata::const_iterator curr = segs.begin();
1278     CDelta_ext::Tdata::const_iterator end = segs.end();
1279     CDelta_ext::Tdata::const_iterator next;
1280     for ( ; curr != end; curr = next ) {
1281         {{
1282             // set next to one after curr
1283             next = curr; ++next;
1284         }}
1285         size_t from = len + 1;
1286         switch ( (*curr)->Which() ) {
1287         case CDelta_seq::e_Loc:
1288             {{
1289                 const CDelta_seq::TLoc& loc = (*curr)->GetLoc();
1290                 if ( loc.IsNull() ) {  // gap
1291                     ++temp.num_gaps;
1292                     text << "* " << from << ' ' << len
1293                          << " gap of unknown length~";
1294                 } else {  // count length
1295                     size_t tlen = sequence::GetLength(loc, &scope);
1296                     len += tlen;
1297                     temp.residues += tlen;
1298                     text << "* " << setw(8) << from << ' ' << setw(8) << len
1299                          << ": contig of " << tlen << " bp in length~";
1300                 }
1301             }}
1302             break;
1303         case CDelta_seq::e_Literal:
1304             {{
1305                 const CDelta_seq::TLiteral& lit = (*curr)->GetLiteral();
1306                 size_t lit_len = lit.CanGetLength() ? lit.GetLength() : 0;
1307                 len += lit_len;
1308                 if ( lit.CanGetSeq_data() && lit.GetSeq_data().Which() != CSeq_data::e_Gap ) {
1309                     temp.residues += lit_len;
1310                     while ( next != end  &&  (*next)->IsLiteral()  &&
1311                         (*next)->GetLiteral().CanGetSeq_data()  &&
1312                         (*next)->GetLiteral().GetSeq_data().Which() != CSeq_data::e_Gap ) {
1313                         const CDelta_seq::TLiteral& next_lit = (*next)->GetLiteral();
1314                         size_t next_len = next_lit.CanGetLength() ?
1315                             next_lit.GetLength() : 0;
1316                         lit_len += next_len;
1317                         len += next_len;
1318                         temp.residues += next_len;
1319                         ++next;
1320                     }
1321                     text << "* " << setw(8) << from << ' ' << setw(8) << len
1322                          << ": contig of " << lit_len << " bp in length~";
1323                 } else {
1324                     bool unk = false;
1325                     ++temp.num_gaps;
1326                     if ( lit.CanGetFuzz() ) {
1327                         const CSeq_literal::TFuzz& fuzz = lit.GetFuzz();
1328                         if ( fuzz.IsLim()  &&
1329                              fuzz.GetLim() == CInt_fuzz::eLim_unk ) {
1330                             unk = true;
1331                             ++temp.num_faked_gaps;
1332                             if ( from > len ) {
1333                                 text << "*                    gap of unknown length~";
1334                             } else {
1335                                 text << "* " << setw(8) << from << ' ' << setw(8) << len
1336                                      << ": gap of unknown length~";
1337                             }
1338                         }
1339                     }
1340                     if ( !unk ) {
1341                         text << "* " << setw(8) << from << " " << setw(8) << len
1342                              << ": gap of " << lit_len << " bp~";
1343                     }
1344                 }
1345             }}
1346             break;
1347 
1348         default:
1349             break;
1350         }
1351     }
1352     summary = temp;
1353     summary.text = CNcbiOstrstreamToString(text);
1354 }
1355 
1356 
// Descriptive strings returned by GetTechString() below, one per supported
// CMolInfo technique value.
SAFE_CONST_STATIC_STRING(kTS_concept_trans,    "conceptual translation");
SAFE_CONST_STATIC_STRING(kTS_concept_trans_a,  "conceptual translation supplied by author");
SAFE_CONST_STATIC_STRING(kTS_both,             "conceptual translation with partial peptide sequencing");
SAFE_CONST_STATIC_STRING(kTS_seq_pept,         "direct peptide sequencing");
SAFE_CONST_STATIC_STRING(kTS_seq_pept_homol,   "sequenced peptide, ordered by homology");
SAFE_CONST_STATIC_STRING(kTS_seq_pept_overlap, "sequenced peptide, ordered by overlap");
1363 
GetTechString(int tech)1364 const string& GetTechString(int tech)
1365 {
1366 
1367     switch ( tech ) {
1368     case CMolInfo::eTech_concept_trans:
1369         return kTS_concept_trans.Get();
1370 
1371     case CMolInfo::eTech_seq_pept :
1372         return kTS_seq_pept.Get();
1373 
1374     case CMolInfo::eTech_both:
1375         return kTS_both.Get();
1376 
1377     case CMolInfo::eTech_seq_pept_overlap:
1378         return kTS_seq_pept_overlap.Get();
1379 
1380     case CMolInfo::eTech_seq_pept_homol:
1381         return kTS_seq_pept_homol.Get();
1382 
1383     case CMolInfo::eTech_concept_trans_a:
1384         return kTS_concept_trans_a.Get();
1385 
1386     default:
1387         return kEmptyStr;
1388     }
1389 
1390     return kEmptyStr;
1391 }
1392 
1393 
s_IsModelEvidanceUop(const CUser_object & uo)1394 bool s_IsModelEvidanceUop(const CUser_object& uo)
1395 {
1396     return (uo.CanGetType()  &&  uo.GetType().IsStr()  &&
1397         uo.GetType().GetStr() == "ModelEvidence");
1398 }
1399 
1400 
// Depth-first search for a "ModelEvidence" user object.
//
// Returns 'uo' itself if it qualifies; otherwise the first match found among
// the user objects nested in its fields (both single-object and
// object-list fields are followed recursively).  Returns 0 if there is none.
const CUser_object* s_FindModelEvidanceUop(const CUser_object& uo)
{
    if ( s_IsModelEvidanceUop(uo) ) {
        return &uo;
    }

    ITERATE (CUser_object::TData, ufi, uo.GetData()) {
        const CUser_field& uf = **ufi;
        if ( !uf.CanGetData() ) {
            continue;
        }
        const CUser_field::TData& data = uf.GetData();

        if ( data.Which() == CUser_field::TData::e_Object ) {
            const CUser_object* nested = s_FindModelEvidanceUop(data.GetObject());
            if ( nested != 0 ) {
                return nested;
            }
        } else if ( data.Which() == CUser_field::TData::e_Objects ) {
            ITERATE (CUser_field::TData::TObjects, obj, data.GetObjects()) {
                const CUser_object* nested = s_FindModelEvidanceUop(**obj);
                if ( nested != 0 ) {
                    return nested;
                }
            }
        }
        // other kinds of field data cannot contain user objects
    }

    return 0;
}
1439 
1440 
s_GetModelEvidance(const CBioseq_Handle & bsh,SModelEvidance & me)1441 bool s_GetModelEvidance(const CBioseq_Handle& bsh, SModelEvidance& me)
1442 {
1443     CConstRef<CUser_object> moduop;
1444     bool result = false;
1445 
1446     for (CSeqdesc_CI it(bsh, CSeqdesc::e_User);  it;  ++it) {
1447         moduop.Reset(s_FindModelEvidanceUop(it->GetUser()));
1448         if (moduop.NotEmpty()) {
1449             result = true;
1450             CConstRef<CUser_field> ufp;
1451             if( moduop->HasField("Contig Name") ) {
1452                 ufp = &(moduop->GetField("Contig Name"));
1453                 if ( ufp.NotEmpty()  &&  ufp->IsSetData()  &&  ufp->GetData().IsStr() ) {
1454                     me.name = ufp->GetData().GetStr();
1455                 }
1456             }
1457             if( moduop->HasField("Assembly") ) {
1458                 ufp = &(moduop->GetField("Assembly"));
1459                 if ( ufp.NotEmpty()  &&  ufp->IsSetData()  &&  ufp->GetData().IsFields() ) {
1460                     ITERATE(CUser_field::C_Data::TFields, fld_itr, ufp->GetData().GetFields()) {
1461                         const CUser_field& field = **fld_itr;
1462                         ITERATE(CUser_field::C_Data::TFields, inr_itr, field.GetData().GetFields()) {
1463                             const CUser_field& ufld = **inr_itr;
1464                             if ( !ufld.IsSetLabel()  ||  !ufld.GetLabel().IsStr() ) continue;
1465                             const string& label = ufld.GetLabel().GetStr();
1466                             if (label != "accession") continue;
1467                             const CUser_field::C_Data& data = ufld.GetData();
1468                             if (data.IsStr()) {
1469                                 const string& accn = data.GetStr();
1470                                 me.assembly.push_back(accn);
1471                             }
1472                         }
1473                     }
1474                 }
1475             }
1476             if ( moduop->HasField("Method") ) {
1477                 ufp = &(moduop->GetField("Method"));
1478                 if ( ufp.NotEmpty()  &&  ufp->IsSetData()  &&  ufp->GetData().IsStr() ) {
1479                     me.method = ufp->GetData().GetStr();
1480                 }
1481             }
1482             if ( moduop->HasField("Counts") ) {
1483                 ufp = &(moduop->GetField("Counts"));
1484                 if ( ufp->HasField("mRNA")) {
1485                      me.mrnaEv = true;
1486                 }
1487                 if ( ufp->HasField("EST")) {
1488                      me.estEv = true;
1489                 }
1490             }
1491             if ( moduop->HasField("mRNA") ) {
1492                 me.mrnaEv = true;
1493             }
1494             if ( moduop->HasField("EST") ) {
1495                 me.estEv = true;
1496             }
1497             if( moduop->HasField("Contig Gi") ) {
1498                 ufp = &(moduop->GetField("Contig Gi"));
1499                 if ( ufp.NotEmpty()  &&  ufp->IsSetData()  &&  ufp->GetData().IsInt() ) {
1500                     me.gi = GI_FROM(CUser_field::C_Data::TInt, ufp->GetData().GetInt());
1501                 }
1502             }
1503             if( moduop->HasField("Contig Span") ) {
1504                 ufp = &(moduop->GetField("Contig Span"));
1505                 if ( ufp.NotEmpty()  &&  ufp->IsSetData()  &&  ufp->GetData().IsInts()
1506                     && ufp->IsSetNum() && ufp->GetNum() == 2 && ufp->GetData().GetInts().size() == 2 )
1507                 {
1508                     const CUser_field::C_Data::TInts & int_list = ufp->GetData().GetInts();
1509                     me.span.first  = int_list[0];
1510                     me.span.second = int_list[1];
1511                 }
1512             }
1513         }
1514     }
1515 
1516     // if me.name is missing version, try to update from me.gi
1517     if( me.gi > ZERO_GI && me.name.find('.') == string::npos ) {
1518         CSeq_id_Handle accver_idh = bsh.GetScope().GetAccVer( CSeq_id_Handle::GetGiHandle(me.gi) );
1519         if( accver_idh ) {
1520             CConstRef<CSeq_id> accver_seq_id = accver_idh.GetSeqIdOrNull();
1521             if( accver_seq_id ) {
1522                 const CTextseq_id *text_id = accver_seq_id->GetTextseq_Id();
1523                 if( text_id && text_id->IsSetAccession() && text_id->IsSetVersion() ) {
1524                     me.name = text_id->GetAccession() + "." + NStr::IntToString(text_id->GetVersion());
1525                 }
1526             }
1527         }
1528     }
1529 
1530     return result;
1531 }
1532 
1533 
GetModelEvidance(const CBioseq_Handle & bsh,SModelEvidance & me)1534 bool GetModelEvidance(const CBioseq_Handle& bsh, SModelEvidance& me)
1535 {
1536     if ( s_GetModelEvidance(bsh, me) ) {
1537         return true;
1538     }
1539 
1540     if ( CSeq_inst::IsAa(bsh.GetInst_Mol()) ) {
1541         CBioseq_Handle nuc = sequence::GetNucleotideParent(bsh);
1542         if ( nuc  ) {
1543             return s_GetModelEvidance(nuc, me);
1544         }
1545     }
1546 
1547     return false;
1548 }
1549 
1550 
// Amino acid display names, in Ncbistdaa order.
// GetAAName() indexes this table directly with the residue code and falls
// back to "OTHER" for codes beyond its end.
static const char* kAANames[] = {
    "---", "Ala", "Asx", "Cys", "Asp", "Glu", "Phe", "Gly", "His", "Ile",
    "Lys", "Leu", "Met", "Asn", "Pro", "Gln", "Arg", "Ser", "Thr", "Val",
    "Trp", "OTHER", "Tyr", "Glx", "Sec", "TERM", "Pyl", "Xle"
};
1557 
1558 
GetAAName(unsigned char aa,bool is_ascii)1559 const char* GetAAName(unsigned char aa, bool is_ascii)
1560 {
1561     if (is_ascii) {
1562         aa = (unsigned char)
1563              CSeqportUtil::GetMapToIndex(CSeq_data::e_Ncbieaa,
1564                                          CSeq_data::e_Ncbistdaa, aa);
1565     }
1566     return (aa < sizeof(kAANames)/sizeof(*kAANames)) ? kAANames[aa] : "OTHER";
1567 }
1568 
1569 //////////////////////////////////////////////////////////////////////////////
1570 
GetResolveOrder(CScope & scope,const CSeq_id_Handle & mrna,const CSeq_id_Handle & prot,CBioseq_Handle & mrna_bsh,CBioseq_Handle & prot_bsh)1571 EResolveOrder GetResolveOrder(CScope& scope,
1572                               const CSeq_id_Handle& mrna,
1573                               const CSeq_id_Handle& prot,
1574                               CBioseq_Handle& mrna_bsh,
1575                               CBioseq_Handle& prot_bsh)
1576 {
1577     EResolveOrder order = eResolve_NotFound;
1578 
1579     if (order == eResolve_NotFound) {
1580         CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
1581         local_scope->AddDefaults();
1582 
1583         CBioseq_Handle possible_mrna = local_scope->GetBioseqHandle(mrna);
1584         CBioseq_Handle possible_prot;
1585         if (possible_mrna) {
1586             possible_prot =
1587                 possible_mrna.GetTopLevelEntry().GetBioseqHandle(prot);
1588         }
1589         if (possible_mrna  &&  possible_prot) {
1590             order = eResolve_RnaFirst;
1591         }
1592     }
1593 
1594     if (order == eResolve_NotFound) {
1595         CRef<CScope> local_scope(new CScope(*CObjectManager::GetInstance()));
1596         local_scope->AddDefaults();
1597 
1598         CBioseq_Handle possible_prot = local_scope->GetBioseqHandle(prot);
1599         CBioseq_Handle possible_mrna;
1600         if (possible_prot) {
1601             possible_mrna =
1602                 possible_prot.GetTopLevelEntry().GetBioseqHandle(mrna);
1603         }
1604 
1605         if (possible_mrna  &&  possible_prot) {
1606             order = eResolve_ProtFirst;
1607         }
1608     }
1609 
1610     switch (order) {
1611     case eResolve_NotFound:
1612         mrna_bsh = CBioseq_Handle();
1613         prot_bsh = CBioseq_Handle();
1614         break;
1615 
1616     case eResolve_RnaFirst:
1617         mrna_bsh = scope.GetBioseqHandle(mrna);
1618         prot_bsh = scope.GetBioseqHandle(prot);
1619         break;
1620 
1621     case eResolve_ProtFirst:
1622         prot_bsh = scope.GetBioseqHandle(prot);
1623         mrna_bsh = scope.GetBioseqHandle(mrna);
1624         break;
1625     }
1626 
1627     return order;
1628 }
1629 
1630 //////////////////////////////////////////////////////////////////////////////
1631 // HTML utils and strings
1632 
1633 //  ============================================================================
1634 //  Link locations:
1635 //  ============================================================================
// Exported base URLs for HTML links.  Several of them end in '/', '=', ':'
// or '?', so callers presumably append an identifier or query to them --
// confirm against the users of each constant.
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseNuc =
    "https://www.ncbi.nlm.nih.gov/nuccore/";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseProt =
    "https://www.ncbi.nlm.nih.gov/protein/";

NCBI_XOBJEDIT_EXPORT const char* strLinkBaseEntrezViewer =
    "https://www.ncbi.nlm.nih.gov/entrez/viewer.fcgi?val="; // https forwarded to http

NCBI_XOBJEDIT_EXPORT const char* strLinkBaseTaxonomy  =
    "https://www.ncbi.nlm.nih.gov/Taxonomy/Browser/wwwtax.cgi?";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseTransTable =
    "https://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi?mode=c#SG";
NCBI_XOBJEDIT_EXPORT const char* strLinkBasePubmed =
    "https://www.ncbi.nlm.nih.gov/pubmed/";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseExpasy =
    "https://enzyme.expasy.org/EC/"; // not government site
// NOTE: the "&amp;" sequences are part of the string, i.e. this URL is
// stored already escaped for embedding in HTML.
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseNucSearch =
    "https://www.ncbi.nlm.nih.gov/sites/entrez?db=Nucleotide&amp;cmd=Search&amp;term=";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseGenomePrj =
    "https://www.ncbi.nlm.nih.gov/bioproject/";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseLatLon =
    "https://www.ncbi.nlm.nih.gov/projects/Sequin/latlonview.html";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseGeneOntology =
    "http://amigo.geneontology.org/amigo/term/GO:"; // not government site
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseGeneOntologyRef =
    "http://www.geneontology.org/cgi-bin/references.cgi#GO_REF:"; // not government site
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseUSPTO =
    "https://patft.uspto.gov/netacgi/nph-Parser?patentnumber=";
NCBI_XOBJEDIT_EXPORT const char* strLinkBaseUniProt =
    "https://www.uniprot.org/uniprot/";

NCBI_XOBJEDIT_EXPORT const char* strDocLink =
    "https://www.ncbi.nlm.nih.gov/genome/annotation_euk/process/";
1669 
namespace {
    // Tells whether the text starting at str_iter begins with the
    // NUL-terminated 'literal'.  The range is never read at or beyond
    // str_iter_end.
    //  ============================================================================
    template<typename TIter>
    bool s_MatchesLiteral(
        TIter str_iter, // yes, COPY not reference
        const TIter &str_iter_end,
        const char* literal)
        //  ============================================================================
    {
        for ( ; *literal != '\0'; ++literal, ++str_iter) {
            if (str_iter == str_iter_end || *str_iter != *literal) {
                return false;
            }
        }
        return true;
    }

    // make sure we're not "double-sanitizing"
    // (e.g. "&gt;" to "&amp;gt;")
    //
    // str_iter must point at a '&'.  Returns false when that ampersand
    // already starts one of the entities "&gt;", "&lt;", "&amp;" or
    // "&quot;", true when it still has to be escaped.
    //  ============================================================================
    template<typename TIter>
    bool s_ShouldWeEscapeAmpersand(
        TIter str_iter, // yes, COPY not reference
        const TIter &str_iter_end)
        //  ============================================================================
    {
        _ASSERT(*str_iter == '&');

        // What may follow the '&' of an entity we leave alone.  Plain
        // character comparison is used on purpose: regexes were considered
        // too slow for this.
        static const char* const entity_tails[] = {
            "gt;",
            "lt;",
            "amp;",
            "quot;"
        };
        static const size_t num_entity_tails =
            (sizeof(entity_tails) / sizeof(entity_tails[0]));

        ++str_iter;
        for (size_t idx = 0; idx < num_entity_tails; ++idx) {
            if (s_MatchesLiteral(str_iter, str_iter_end, entity_tails[idx])) {
                return false;
            }
        }
        return true;
    }

    // see if the '<' opens an HTML tag (currently we
    // only check for a few kinds of tags )
    //
    // Returns true when the text at str_iter starts with one of the
    // recognized tag openings listed below.
    //  ============================================================================
    template<typename TIter>
    bool s_IsTagStart(
        const TIter &str_iter,
        const TIter &str_iter_end)
        //  ============================================================================
    {
        static const char* const possible_tag_starts[] = {
            "<a href=",
            "<acronym title",
            "</a>",
            "</acronym"
        };
        static const size_t num_possible_tag_starts =
            (sizeof(possible_tag_starts) / sizeof(possible_tag_starts[0]));

        // check every string it might start with; we're in a tag as soon as
        // one of them is matched completely
        for (size_t idx = 0; idx < num_possible_tag_starts; ++idx) {
            if (s_MatchesLiteral(str_iter, str_iter_end, possible_tag_starts[idx])) {
                return true;
            }
        }

        return false;
    }

}
1773 
// Replace every double quote that lies outside of a '<' ... '>' span with a
// single quote, editing str in place.  The tag detection is a heuristic: any
// '<' is treated as opening a tag and any '>' as closing one.
//
// Returns true if at least one character was replaced.
bool ConvertQuotesNotInHTMLTags(string &str)
{
    bool any_replaced = false;
    bool inside_tag = false;

    for (string::iterator ch_it = str.begin(); ch_it != str.end(); ++ch_it) {
        const char ch = *ch_it;
        if (ch == '<') {
            // heuristic
            inside_tag = true;
        }
        else if (ch == '>') {
            inside_tag = false;
        }
        else if (ch == '"' && !inside_tag) {
            *ch_it = '\'';
            any_replaced = true;
        }
    }

    return any_replaced;
}
1800 
1801 
1802 //  ============================================================================
TryToSanitizeHtml(string & str)1803 void TryToSanitizeHtml(string &str)
1804 {
1805     string result;
1806     // The "* 1.1" should keep up efficient in most cases since data tends not to have
1807     // too many characters that need escaping.
1808     result.reserve(1 + (int)((double)str.length() * 1.1));
1809     TryToSanitizeHtml(result, str);
1810 
1811     // swap is faster than assignment
1812     str.swap(result);
1813 }
1814 
TryToSanitizeHtml(std::string & result,const CTempString & str)1815 void TryToSanitizeHtml(std::string &result, const CTempString& str)
1816 //  ============================================================================
1817 {
1818     result.clear();
1819 
1820     // we only sanitize when we're not in an url
1821     bool in_html_tag = false;
1822     ITERATE(CTempString, str_iter, str) {
1823         // see if we're entering an HTML tag
1824         if (!in_html_tag && *str_iter == '<' && s_IsTagStart(str_iter, str.end())) {
1825             in_html_tag = true;
1826         }
1827 
1828         // now that we know whether we're in a tag,
1829         // process characters appropriately.
1830         if (in_html_tag) {
1831             switch (*str_iter) {
1832             case '&':
1833                 // make sure we're not "double-sanitizing"
1834                 // (e.g. "&gt;" to "&amp;gt;")
1835                 if (s_ShouldWeEscapeAmpersand(str_iter, str.end())) {
1836                     result += "&amp;";
1837                 }
1838                 else {
1839                     result += '&';
1840                 }
1841                 break;
1842             default:
1843                 result += *str_iter;
1844                 break;
1845             }
1846         }
1847         else {
1848             switch (*str_iter) {
1849             case '<':
1850                 result += "&lt;";
1851                 break;
1852             case '>':
1853                 result += "&gt;";
1854                 break;
1855             default:
1856                 result += *str_iter;
1857                 break;
1858             }
1859         }
1860 
1861         // see if we're exiting an HTML tag
1862         if (in_html_tag && *str_iter == '>') {
1863             // tag is closed now
1864             // (Note: does this consider cases where '>' is in quotes?)
1865             in_html_tag = false;
1866         }
1867     }
1868 }
1869 
1870 void
TryToSanitizeHtmlList(std::list<std::string> & strs)1871 TryToSanitizeHtmlList( std::list<std::string> &strs )
1872 {
1873     NON_CONST_ITERATE( std::list<std::string>, str_iter, strs ) {
1874         TryToSanitizeHtml( *str_iter );
1875     }
1876 }
1877 
1878 bool
CommentHasSuspiciousHtml(const string & str)1879 CommentHasSuspiciousHtml( const string &str )
1880 {
1881     // list is not complete, still need to take proper precautions
1882     static const char* bad_html_strings[] = {
1883         "<script", "<object", "<applet", "<embed", "<form",
1884         "javascript:", "vbscript:"
1885     };
1886 
1887     // load matching fsa if not already done
1888     static CSafeStatic<CTextFsa> fsa;
1889     if( ! fsa->IsPrimed() ) {
1890         for( size_t ii = 0; ii < ArraySize(bad_html_strings); ++ii ) {
1891             fsa->AddWord( bad_html_strings[ii] );
1892         }
1893         fsa->Prime();
1894     }
1895 
1896     // do the match
1897     int current_state = 0;
1898     for ( SIZE_TYPE str_idx = 0 ; str_idx < str.length(); ++str_idx) {
1899         const char ch = str[str_idx];
1900         int next_state = fsa->GetNextState (current_state, ch);
1901         if (fsa->IsMatchFound (next_state)) {
1902             return true;
1903         }
1904         current_state = next_state;
1905     }
1906 
1907     return false;
1908 }
1909 
1910 
1911 END_SCOPE(objects)
1912 END_NCBI_SCOPE
1913