1 /* utilfun.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  utilfun.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Utility functions for parser and indexing.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include <corelib/ncbistr.hpp>
41 #include <objmgr/scope.hpp>
42 #include <objmgr/object_manager.hpp>
43 #include <objects/seq/MolInfo.hpp>
44 #include <objects/seqloc/PDB_seq_id.hpp>
45 #include <corelib/tempstr.hpp>
46 
47 #include "index.h"
48 
49 #include "ftaerr.hpp"
50 #include "indx_def.h"
51 #include "utilfun.h"
52 
53 #ifdef THIS_FILE
54 #    undef THIS_FILE
55 #endif
56 #define THIS_FILE "utilfun.cpp"
57 
58 BEGIN_NCBI_SCOPE;
59 
60 USING_SCOPE(objects);
61 
GetScope()62 CScope& GetScope()
63 {
64     static CScope scope(*CObjectManager::GetInstance());
65     return scope;
66 }
67 
68 
/* Keyword spelling tables.
 *
 * Every table is a list of C strings closed by a null pointer, which is
 * the layout the array-matching helpers in this file (MatchArrayString()
 * and friends) iterate over. The strings are compared verbatim, so their
 * spelling must not be "corrected".
 */

/* EST keywords. */
static const char *ParFlat_EST_kw_array[] = {
    "EST",
    "EST PROTO((expressed sequence tag)",
    "expressed sequence tag",
    "EST (expressed sequence tag)",
    "EST (expressed sequence tags)",
    "EST(expressed sequence tag)",
    "transcribed sequence fragment",
    nullptr
};

/* GSS keywords. */
static const char *ParFlat_GSS_kw_array[] = {
    "GSS",
    "GSS (genome survey sequence)",
    "trapped exon",
    nullptr
};

/* STS keywords. */
static const char *ParFlat_STS_kw_array[] = {
    "STS",
    "STS(sequence tagged site)",
    "STS (sequence tagged site)",
    "STS sequence",
    "sequence tagged site",
    nullptr
};

/* HTC keywords. */
static const char *ParFlat_HTC_kw_array[] = {
    "HTC",
    nullptr
};

/* FLI_CDNA keywords. */
static const char *ParFlat_FLI_kw_array[] = {
    "FLI_CDNA",
    nullptr
};

/* WGS keywords. */
static const char *ParFlat_WGS_kw_array[] = {
    "WGS",
    nullptr
};

/* MGA keywords. */
static const char *ParFlat_MGA_kw_array[] = {
    "MGA",
    "CAGE (Cap Analysis Gene Expression)",
    "5'-SAGE",
    nullptr
};

/* Additional MGA-related keywords (no bare "MGA" entry here). */
static const char *ParFlat_MGA_more_kw_array[] = {
    "CAGE (Cap Analysis Gene Expression)",
    "5'-SAGE",
    "5'-end tag",
    "unspecified tag",
    "small RNA",
    nullptr
};

/* TSA keywords.
 * Any change of contents of this array requires proper modifications
 * in function fta_tsa_keywords_check().
 */
static const char *ParFlat_TSA_kw_array[] = {
    "TSA",
    "Transcriptome Shotgun Assembly",
    nullptr
};

/* TLS keywords.
 * Any change of contents of this array requires proper modifications
 * in function fta_tls_keywords_check().
 */
static const char *ParFlat_TLS_kw_array[] = {
    "TLS",
    "Targeted Locus Study",
    nullptr
};

/* TPA keywords.
 * Any change of contents of this array or of
 * ParFlat_TPA_kw_array_to_remove requires proper modifications in
 * function fta_tpa_keywords_check().
 */
static const char *ParFlat_TPA_kw_array[] = {
    "TPA",
    "THIRD PARTY ANNOTATION",
    "THIRD PARTY DATA",
    "TPA:INFERENTIAL",
    "TPA:EXPERIMENTAL",
    "TPA:REASSEMBLY",
    "TPA:ASSEMBLY",
    "TPA:SPECIALIST_DB",
    nullptr
};

/* Subset of the TPA keywords; judging by the name these are the ones
 * that get removed - confirm against fta_tpa_keywords_check().
 */
static const char *ParFlat_TPA_kw_array_to_remove[] = {
    "TPA",
    "THIRD PARTY ANNOTATION",
    "THIRD PARTY DATA",
    nullptr
};

/* ENV keywords. */
static const char *ParFlat_ENV_kw_array[] = {
    "ENV",
    nullptr
};
171 
172 /**********************************************************/
/* Convert a signed integer to its decimal text form ("-" prefix for
 * negative values, no padding).
 *
 * Delegates to std::to_string(): the previous hand-rolled digit loop
 * negated the value first ("m *= sign"), which overflows for the most
 * negative Int4. The produced text is the same for every other value.
 */
static std::string FTAitoa(Int4 m)
{
    return std::to_string(m);
}
189 
190 /**********************************************************/
UnwrapAccessionRange(const objects::CGB_block::TExtra_accessions & extra_accs,objects::CGB_block::TExtra_accessions & hist)191 void UnwrapAccessionRange(const objects::CGB_block::TExtra_accessions& extra_accs, objects::CGB_block::TExtra_accessions& hist)
192 {
193     Int4       num1;
194     Int4       num2;
195 
196     objects::CGB_block::TExtra_accessions ret;
197 
198     ITERATE(objects::CGB_block::TExtra_accessions, acc, extra_accs)
199     {
200         std::string str = *acc;
201         if (str.empty())
202             continue;
203 
204         size_t dash = str.find('-');
205         if (dash == std::string::npos)
206         {
207             ret.push_back(str);
208             continue;
209         }
210 
211         std::string first(str.begin(), str.begin() + dash),
212                     last(str.begin() + dash + 1, str.end());
213         size_t acclen = first.size();
214 
215         const Char* p = first.c_str();
216         for (; (*p >= 'A' && *p <= 'Z') || *p == '_';)
217             p++;
218 
219         size_t preflen = p - first.c_str();
220 
221         std::string prefix = first.substr(0, preflen);
222         while(*p == '0')
223             p++;
224 
225         const Char* q = p;
226         for (q = p; *p >= '0' && *p <= '9';)
227             p++;
228         num1 = atoi(q);
229 
230         for (p = last.c_str() + preflen; *p == '0';)
231             p++;
232         for(q = p; *p >= '0' && *p <= '9';)
233             p++;
234         num2 = atoi(q);
235 
236         ret.push_back(first);
237 
238         if(num1 == num2)
239             continue;
240 
241         for (num1++; num1 <= num2; num1++)
242         {
243             std::string new_acc = prefix;
244 
245             std::string num_str = FTAitoa(num1);
246             size_t j = acclen - preflen - num_str.size();
247 
248             for(size_t i = 0; i < j; i++)
249                 new_acc += '0';
250 
251             new_acc += num_str;
252             ret.push_back(new_acc);
253         }
254     }
255 
256     ret.swap(hist);
257 }
258 
/* True for the characters allowed in an accession prefix: upper-case
 * ASCII letters and the underscore.
 */
static bool sIsPrefixChar(char c)
{
    if (c == '_')
        return true;

    return c >= 'A' && c <= 'Z';
}
262 /**********************************************************/
ParseAccessionRange(list<string> & tokens,int skip)263 bool ParseAccessionRange(list<string>& tokens, int skip)
264 {
265     bool bad = false;
266 
267     if (tokens.empty()) {
268         return true;
269     }
270 
271     if (tokens.size() <= skip+1) {
272         return true;
273     }
274 
275 
276 
277     auto it = tokens.begin();
278     if (skip) {
279         advance(it, skip);
280     }
281 
282     for (; it != tokens.end(); ++it) {
283         const auto& token = *it;
284         if (token.empty()) {
285             continue;
286         }
287 
288         CTempString first, last;
289         if (!NStr::SplitInTwo(token, "-", first, last)) {
290             continue;
291         }
292         if (first.size() != last.size()) {
293             bad = true;
294             break;
295         }
296 
297         auto first_it =
298             find_if_not(begin(first), end(first), sIsPrefixChar);
299 
300         if (first_it == first.end()) {
301             bad = true;
302             break;
303         }
304 
305 
306         auto last_it =
307             find_if_not(begin(last), end(last), sIsPrefixChar);
308         if (last_it == last.end()) {
309             bad = true;
310             break;
311         }
312 
313         auto prefixLength = distance(first.begin(), first_it);
314         if (prefixLength != distance(last.begin(), last_it) ||
315             !NStr::EqualCase(first, 0, prefixLength, last.substr(0, prefixLength))) {
316             ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch,
317                       "Inconsistent prefix found in secondary accession range \"%s\".",
318                       token.c_str());
319             break;
320         }
321 
322         auto num1 = NStr::StringToInt(first.substr(prefixLength));
323         auto num2 = NStr::StringToInt(last.substr(prefixLength));
324 
325         if  (num2 <= num1) {
326             ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
327                       "Invalid start/end values in secondary accession range \"%s\".",
328                       token.c_str());
329         }
330 
331         *it = first;
332         it = tokens.insert(it, "-");
333         it = tokens.insert(it, last);
334     }
335 
336 
337 /*
338     for(bad = false; tbp != NULL; tbp = tbpnext)
339     {
340         tbpnext = tbp->next;
341         if(tbp->str == NULL)
342             continue;
343         dash = StringChr(tbp->str, '-');
344         if(dash == NULL)
345             continue;
346         *dash = '\0';
347         first = tbp->str;
348         last = dash + 1;
349         if(StringLen(first) != StringLen(last) || *first < 'A' ||
350            *first > 'Z' || *last < 'A' || *last > 'Z')
351         {
352             *dash = '-';
353             bad = true;
354             break;
355         }
356 
357         for(p = first; (*p >= 'A' && *p <= 'Z') || *p == '_';)
358             p++;
359         if(*p < '0' || *p > '9')
360         {
361             *dash = '-';
362             bad = true;
363             break;
364         }
365         for(q = last; (*q >= 'A' && *q <= 'Z') || *q == '_';)
366             q++;
367         if(*q < '0' || *q > '9')
368         {
369             *dash = '-';
370             bad = true;
371             break;
372         }
373         size_t preflen = p - first;
374         if(preflen != (size_t) (q - last) || StringNCmp(first, last, preflen) != 0)
375         {
376             *dash = '-';
377             ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch,
378                       "Inconsistent prefix found in secondary accession range \"%s\".",
379                       tbp->str);
380             break;
381         }
382 
383 
384         while(*p == '0') // ignore all the zeros
385             p++;
386         for(q = p; *p >= '0' && *p <= '9';)
387             p++;
388         if(*p != '\0')
389         {
390             *dash = '-';
391             bad = true;
392             break;
393         }
394         num1 = atoi(q); // the first number
395 
396         for(p = last + preflen; *p == '0';)
397             p++;
398         for(q = p; *p >= '0' && *p <= '9';)
399             p++;
400         if(*p != '\0')
401         {
402             *dash = '-';
403             bad = true;
404             break;
405         }
406         num2 = atoi(q);
407 
408         if(num1 > num2)
409         {
410             *dash = '-';
411             ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
412                       "Invalid start/end values in secondary accession range \"%s\".",
413                       tbp->str);
414             break;
415         }
416 
417         tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
418         tbp = tbp->next;
419         tbp->str = StringSave("-");
420         tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
421         tbp = tbp->next;
422         tbp->str = StringSave(last);
423         tsbp->num += 2;
424 
425         tbp->next = tbpnext;
426     }
427     if(tbp == NULL)
428         return true;
429     */
430     if(bad)
431     {
432         ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
433                   "Incorrect secondary accession range provided: \"%s\".",
434                   it->c_str());
435     }
436     return false;
437 }
438 
439 /**********************************************************/
ParseAccessionRange(TokenStatBlkPtr tsbp,Int4 skip)440 bool ParseAccessionRange(TokenStatBlkPtr tsbp, Int4 skip)
441 {
442     TokenBlkPtr tbp;
443     TokenBlkPtr tbpnext;
444     char*     dash;
445     char*     first;
446     char*     last;
447     char*     p;
448     char*     q;
449     bool        bad;
450     Int4        num1;
451     Int4        num2;
452 
453     if(tsbp->list == NULL)
454         return true;
455 
456     tbp = NULL;
457     if(skip == 0)
458         tbp = tsbp->list;
459     else if(skip == 1)
460     {
461         if(tsbp->list != NULL)
462             tbp = tsbp->list->next;
463     }
464     else
465     {
466         if(tsbp->list != NULL && tsbp->list->next != NULL)
467             tbp = tsbp->list->next->next;
468     }
469     if(tbp == NULL)
470         return true;
471 
472     for(bad = false; tbp != NULL; tbp = tbpnext)
473     {
474         tbpnext = tbp->next;
475         if(tbp->str == NULL)
476             continue;
477         dash = StringChr(tbp->str, '-');
478         if(dash == NULL)
479             continue;
480         *dash = '\0';
481         first = tbp->str;
482         last = dash + 1;
483         if(StringLen(first) != StringLen(last) || *first < 'A' ||
484            *first > 'Z' || *last < 'A' || *last > 'Z')
485         {
486             *dash = '-';
487             bad = true;
488             break;
489         }
490 
491         for(p = first; (*p >= 'A' && *p <= 'Z') || *p == '_';)
492             p++;
493         if(*p < '0' || *p > '9')
494         {
495             *dash = '-';
496             bad = true;
497             break;
498         }
499         for(q = last; (*q >= 'A' && *q <= 'Z') || *q == '_';)
500             q++;
501         if(*q < '0' || *q > '9')
502         {
503             *dash = '-';
504             bad = true;
505             break;
506         }
507         size_t preflen = p - first;
508         if(preflen != (size_t) (q - last) || StringNCmp(first, last, preflen) != 0)
509         {
510             *dash = '-';
511             ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch,
512                       "Inconsistent prefix found in secondary accession range \"%s\".",
513                       tbp->str);
514             break;
515         }
516 
517         while(*p == '0')
518             p++;
519         for(q = p; *p >= '0' && *p <= '9';)
520             p++;
521         if(*p != '\0')
522         {
523             *dash = '-';
524             bad = true;
525             break;
526         }
527         num1 = atoi(q);
528 
529         for(p = last + preflen; *p == '0';)
530             p++;
531         for(q = p; *p >= '0' && *p <= '9';)
532             p++;
533         if(*p != '\0')
534         {
535             *dash = '-';
536             bad = true;
537             break;
538         }
539         num2 = atoi(q);
540 
541         if(num1 > num2)
542         {
543             *dash = '-';
544             ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
545                       "Invalid start/end values in secondary accession range \"%s\".",
546                       tbp->str);
547             break;
548         }
549 
550         tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
551         tbp = tbp->next;
552         tbp->str = StringSave("-");
553         tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
554         tbp = tbp->next;
555         tbp->str = StringSave(last);
556         tsbp->num += 2;
557 
558         tbp->next = tbpnext;
559     }
560     if(tbp == NULL)
561         return true;
562     if(bad)
563     {
564         ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
565                   "Incorrect secondary accession range provided: \"%s\".",
566                   tbp->str);
567     }
568     return false;
569 }
570 
571 /**********************************************************/
TokenNodeNew(TokenBlkPtr tbp)572 static TokenBlkPtr TokenNodeNew(TokenBlkPtr tbp)
573 {
574     TokenBlkPtr newnode = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
575 
576     if(tbp != NULL)
577     {
578         while(tbp->next != NULL)
579             tbp = tbp->next;
580         tbp->next = newnode;
581     }
582 
583     return(newnode);
584 }
585 
586 /**********************************************************/
InsertTokenVal(TokenBlkPtr * tbp,char * str)587 static void InsertTokenVal(TokenBlkPtr* tbp, char* str)
588 {
589     TokenBlkPtr ltbp;
590 
591     ltbp = *tbp;
592     ltbp = TokenNodeNew(ltbp);
593     ltbp->str = StringSave(str);
594 
595     if(*tbp == NULL)
596         *tbp = ltbp;
597 }
598 
599 /**********************************************************
600  *
601  *   TokenStatBlkPtr TokenString(str, delimiter):
602  *
603  *      Parsing string "str" by delimiter or tab key, blank.
604  *      Parsing stop at newline ('\n') or end of string ('\0').
605  *      Return a statistics of link list token.
606  *
607  **********************************************************/
TokenString(char * str,Char delimiter)608 TokenStatBlkPtr TokenString(char* str, Char delimiter)
609 {
610     char*         bptr;
611     char*         ptr;
612     char*         curtoken;
613     Int2            num;
614     TokenStatBlkPtr token;
615     Char            ch;
616 
617     token = (TokenStatBlkPtr) MemNew(sizeof(TokenStatBlk));
618 
619     /* skip first several delimiters if any existed
620      */
621     for(ptr = str; *ptr == delimiter;)
622         ptr++;
623 
624     for(num = 0; *ptr != '\0' && *ptr != '\n';)
625     {
626         for(bptr = ptr; *ptr != delimiter && *ptr != '\n' &&
627             *ptr != '\t' && *ptr != ' ' && *ptr != '\0';)
628             ptr++;
629 
630         ch = *ptr;
631         *ptr = '\0';
632         curtoken = StringSave(bptr);
633         *ptr = ch;
634 
635         InsertTokenVal(&token->list, curtoken);
636         num++;
637         MemFree(curtoken);
638 
639         while(*ptr == delimiter || *ptr == '\t' || *ptr == ' ')
640             ptr++;
641     }
642 
643     token->num = num;
644 
645     return(token);
646 }
647 
648 /**********************************************************
649  *
650  *   TokenStatBlkPtr TokenStringByDelimiter(str, delimiter):
651  *
652  *      Parsing string "str" by delimiter.
653  *      Parsing stop at end of string ('\0').
654  *      Return a statistics of link list token.
655  *
656  **********************************************************/
TokenStringByDelimiter(char * str,Char delimiter)657 TokenStatBlkPtr TokenStringByDelimiter(char* str, Char delimiter)
658 {
659     char*         bptr;
660     char*         ptr;
661     char*         curtoken;
662     char*         s;
663     Int2            num;
664     TokenStatBlkPtr token;
665     Char            ch;
666 
667     token = (TokenStatBlkPtr) MemNew(sizeof(TokenStatBlk));
668 
669     /* skip first several delimiters if any existed
670      */
671     for(ptr = str; *ptr == delimiter;)
672         ptr++;
673 
674     /* remove '.' from the end of the string
675      */
676     s = ptr + StringLen(ptr) - 1;
677     if(*s == '.')
678         *s = '\0';
679 
680     for(num = 0; *ptr != '\0';)
681     {
682         for(bptr = ptr; *ptr != delimiter && *ptr != '\0';)
683             ptr++;
684 
685         ch = *ptr;
686         *ptr = '\0';
687         curtoken = StringSave(bptr);
688         *ptr = ch;
689 
690         InsertTokenVal(&token->list, curtoken);
691         num++;
692         MemFree(curtoken);
693 
694         while(*ptr == delimiter || *ptr == ' ')
695             ptr++;
696     }
697 
698     token->num = num;
699 
700     return(token);
701 }
702 
703 /**********************************************************/
FreeTokenblk(TokenBlkPtr tbp)704 void FreeTokenblk(TokenBlkPtr tbp)
705 {
706     TokenBlkPtr temp;
707 
708     while(tbp != NULL)
709     {
710         temp = tbp;
711         tbp = tbp->next;
712         MemFree(temp->str);
713         MemFree(temp);
714     }
715 }
716 
717 /**********************************************************/
FreeTokenstatblk(TokenStatBlkPtr tsbp)718 void FreeTokenstatblk(TokenStatBlkPtr tsbp)
719 {
720     FreeTokenblk(tsbp->list);
721     MemFree(tsbp);
722 }
723 
724 /**********************************************************
725  *
726  *   Int2 fta_StringMatch(array, text):
727  *
728  *      Return array position of the matched length
729  *   of string in array.
730  *      Return -1 if no match.
731  *
732  **********************************************************/
fta_StringMatch(const Char ** array,const Char * text)733 Int2 fta_StringMatch(const Char **array, const Char* text)
734 {
735     Int2 i;
736 
737     if(text == NULL)
738         return(-1);
739 
740     for (i = 0; *array != NULL; i++, array++)
741     {
742         if (NStr::EqualCase(text, 0, StringLen(*array), *array))
743             break;
744     }
745 
746     if(*array == NULL)
747         return(-1);
748 
749     return(i);
750 }
751 
752 /**********************************************************
753  *
754  *   Int2 StringMatchIcase(array, text):
755  *
756  *      Return array position of the matched lenght of
757  *   string (ignored case) in array.
758  *      Return -1 if no match.
759  *
760  **********************************************************/
StringMatchIcase(const Char ** array,const Char * text)761 Int2 StringMatchIcase(const Char **array, const Char* text)
762 {
763     Int2 i;
764 
765     if(text == NULL)
766         return(-1);
767 
768     for (i = 0; *array != NULL; i++, array++)
769     {
770         // If string from an array is empty its length == 0 and would be equval to any other string
771         // The next 'if' statement will avoid that behavior
772         if (text[0] != 0 && *array[0] == 0)
773             continue;
774 
775         if (NStr::EqualNocase(text, 0, StringLen(*array), *array))
776             break;
777     }
778 
779     if(*array == NULL)
780         return(-1);
781     return(i);
782 }
783 
784 /**********************************************************
785  *
786  *   Int2 MatchArrayString(array, text):
787  *
788  *      Return array position of the string in the
789  *   array.
790  *      Return -1 if no match.
791  *
792  **********************************************************/
MatchArrayString(const char ** array,const char * text)793 Int2 MatchArrayString(const char **array, const char *text)
794 {
795     Int2 i;
796 
797     if(text == NULL)
798         return(-1);
799 
800     for (i = 0; *array != NULL; i++, array++)
801     {
802         if (NStr::Equal(*array, text))
803             break;
804     }
805 
806     if(*array == NULL)
807         return(-1);
808     return(i);
809 }
810 
811 /**********************************************************/
MatchArrayIString(const Char ** array,const Char * text)812 Int2 MatchArrayIString(const Char **array, const Char *text)
813 {
814     Int2 i;
815 
816     if(text == NULL)
817         return(-1);
818 
819     for (i = 0; *array != NULL; i++, array++)
820     {
821         // If string from an array is empty its length == 0 and would be equval to any other string
822         // The next 'if' statement will avoid that behavior
823         if (text[0] != 0 && *array[0] == 0)
824             continue;
825 
826         if (NStr::EqualNocase(*array, text))
827             break;
828     }
829 
830     if(*array == NULL)
831         return(-1);
832     return(i);
833 }
834 
835 /**********************************************************
836  *
837  *   Int2 MatchArraySubString(array, text):
838  *
839  *      Return array position of the string in the array
840  *   if any array is in the substring of "text".
841  *      Return -1 if no match.
842  *
843  **********************************************************/
MatchArraySubString(const Char ** array,const Char * text)844 Int2 MatchArraySubString(const Char **array, const Char* text)
845 {
846     Int2 i;
847 
848     if(text == NULL)
849         return(-1);
850 
851     for (i = 0; *array != NULL; i++, array++)
852     {
853         if (NStr::Find(text, *array) != NPOS)
854             break;
855     }
856 
857     if(*array == NULL)
858         return(-1);
859     return(i);
860 }
861 
862 /**********************************************************/
/* Case-insensitive substring search (ASCII letters only).
 *
 * Returns a pointer to the first place in "where" at which "what"
 * occurs when 'A'..'Z' and 'a'..'z' are treated as equal, or NULL if
 * there is no such place or either argument is NULL or empty.
 */
Char* StringIStr(const Char* where, const Char *what)
{
    if(where == NULL || *where == '\0' || what == NULL || *what == '\0')
        return(NULL);

    /* Map upper-case ASCII letters onto their lower-case twins. */
    auto foldCase = [](Char c) -> int {
        return (c >= 'A' && c <= 'Z') ? c + 32 : c;
    };

    for(const Char* start = where; *start != '\0'; start++)
    {
        const Char* hay = start;
        const Char* needle = what;

        while(*needle != '\0' && *hay != '\0' &&
              foldCase(*needle) == foldCase(*hay))
        {
            needle++;
            hay++;
        }

        /* whole pattern consumed: found it */
        if(*needle == '\0')
            return const_cast<char*>(start);

        /* text exhausted first: no later position can match either */
        if(*hay == '\0')
            break;
    }

    return(NULL);
}
898 
899 /**********************************************************/
MatchArrayISubString(const Char ** array,const Char * text)900 Int2 MatchArrayISubString(const Char **array, const Char* text)
901 {
902     Int2 i;
903 
904     if(text == NULL)
905         return(-1);
906 
907     for (i = 0; *array != NULL; i++, array++)
908     {
909         if (NStr::FindNoCase(text, *array) != NPOS)
910             break;
911     }
912 
913     if(*array == NULL)
914         return(-1);
915     return(i);
916 }
917 
918 /**********************************************************
919  *
920  *   char* GetBlkDataReplaceNewLine(bptr, eptr,
921  *                                    start_col_data):
922  *
923  *      Return a string which replace newline to blank
924  *   and skip "XX" line data.
925  *
926  **********************************************************/
GetBlkDataReplaceNewLine(char * bptr,char * eptr,Int2 start_col_data)927 char* GetBlkDataReplaceNewLine(char* bptr, char* eptr,
928                                  Int2 start_col_data)
929 {
930     char* ptr;
931 
932     if(bptr + start_col_data >= eptr)
933         return(NULL);
934 
935     size_t size = eptr - bptr;
936     char* retstr = (char*)MemNew(size + 1);
937     char* str = retstr;
938 
939     while(bptr < eptr)
940     {
941         if (NStr::Equal(bptr, 0, 2, "XX"))      /* skip XX line data */
942         {
943             ptr = SrchTheChar(bptr, eptr, '\n');
944             bptr = ptr + 1;
945             continue;
946         }
947 
948         bptr += start_col_data;
949         ptr = SrchTheChar(bptr, eptr, '\n');
950 
951         if(ptr != NULL)
952         {
953             size = ptr - bptr;
954             MemCpy(str, bptr, size);
955             str += size;
956             if(*(ptr - 1) != '-' || *(ptr - 2) == ' ')
957             {
958                 StringCpy(str, " ");
959                 str++;
960             }
961             bptr = ptr;
962         }
963         bptr++;
964     }
965 
966     std::string tstr = NStr::TruncateSpaces(std::string(retstr), NStr::eTrunc_End);
967     MemFree(retstr);
968     retstr = StringSave(tstr.c_str());
969 
970     return(retstr);
971 }
972 
973 
974 /**********************************************************/
/* Return the length "str" would have with its trailing ' ', '\n',
 * '\\', ',', ';', '~', '.' and ':' characters removed, looking at the
 * first "len" characters only.
 *
 * Returns 0 when "str" is NULL, "len" is 0, or the string consists of
 * such characters only. (Counting down an unsigned index with a
 * ">= 0" test never terminates on its own, so the scan is bounded by
 * "keep > 0" instead.)
 */
static size_t SeekLastAlphaChar(const Char* str, size_t len)
{
    if (str == NULL)
        return 0;

    auto isTrailingJunk = [](Char c) {
        return c == ' ' || c == '\n' || c == '\\' || c == ',' ||
               c == ';' || c == '~' || c == '.' || c == ':';
    };

    size_t keep = len;
    while (keep > 0 && isTrailingJunk(str[keep - 1]))
        --keep;

    return keep;
}
996 
997 /**********************************************************/
CleanTailNoneAlphaCharInString(std::string & str)998 void CleanTailNoneAlphaCharInString(std::string& str)
999 {
1000     size_t ret = SeekLastAlphaChar(str.c_str(), str.size());
1001     str = str.substr(0, ret);
1002 }
1003 
1004 /**********************************************************
1005  *
1006  *   void CleanTailNoneAlphaChar(str):
1007  *
1008  *      Delete any tailing ' ', '\n', '\\', ',', ';', '~',
1009  *   '.', ':' characters.
1010  *
1011  **********************************************************/
CleanTailNoneAlphaChar(char * str)1012 void CleanTailNoneAlphaChar(char* str)
1013 {
1014     if(str == NULL || *str == '\0')
1015         return;
1016 
1017     size_t last = SeekLastAlphaChar(str, strlen(str));
1018     str[last] = '\0';
1019 }
1020 
1021 /**********************************************************/
/* Advance "ptr" past the current blank-delimited token and the blanks
 * that follow it.
 *
 * Returns a pointer to the first character of the next token, or to
 * the terminating NUL when there is no further token. A NULL "ptr" is
 * returned unchanged.
 */
char* PointToNextToken(char* ptr)
{
    if(ptr != NULL)
    {
        /* Stop at the end of the string as well: a last token has no
         * blank behind it.
         */
        while(*ptr != '\0' && *ptr != ' ')
            ptr++;
        while(*ptr == ' ')
            ptr++;
    }
    return(ptr);
}
1033 
1034 /**********************************************************
1035  *
1036  *   char* GetTheCurrentToken(ptr):
1037  *
1038  *      Return the current token (also CleanTailNoneAlphaChar)
1039  *   which ptr points to and ptr will points to next token
1040  *   after the routine return.
1041  *
1042  **********************************************************/
GetTheCurrentToken(char ** ptr)1043 char* GetTheCurrentToken(char** ptr)
1044 {
1045     char* retptr;
1046     char* bptr;
1047     char* str;
1048     Char    ch;
1049 
1050     bptr = retptr = *ptr;
1051     if(retptr == NULL || *retptr == '\0')
1052         return(NULL);
1053 
1054     while(*retptr != '\0' && *retptr != ' ')
1055         retptr++;
1056 
1057     ch = *retptr;
1058     *retptr = '\0';
1059     str = StringSave(bptr);
1060     *retptr = ch;
1061 
1062     while(*retptr != '\0' && *retptr == ' ')    /* skip blanks */
1063         retptr++;
1064     *ptr = retptr;
1065 
1066     CleanTailNoneAlphaChar(str);
1067     return(str);
1068 }
1069 
/**********************************************************
 *
 *   char* SrchTheChar(bptr, eptr, letter):
 *
 *      Searches [bptr, eptr) for the character "letter".
 *      Returns a pointer to its first occurrence, or NULL
 *   if it does not occur in that range.
 *
 **********************************************************/
char* SrchTheChar(char* bptr, char* eptr, Char letter)
{
    for(char* cur = bptr; cur < eptr; cur++)
    {
        if(*cur == letter)
            return(cur);
    }

    return(NULL);
}
1089 
1090 /**********************************************************
1091  *
1092  *   char* SrchTheStr(bptr, eptr, leadstr):
1093  *
1094  *      Search The leading string.
1095  *      Return NULL if not found; otherwise, return
1096  *   a pointer points first occurrence The leading string.
1097  *
1098  **********************************************************/
SrchTheStr(char * bptr,char * eptr,const char * leadstr)1099 char* SrchTheStr(char* bptr, char* eptr, const char *leadstr)
1100 {
1101     char* p;
1102     Char    c;
1103 
1104     c = *eptr;
1105     *eptr = '\0';
1106     p = StringStr(bptr, leadstr);
1107     *eptr = c;
1108     return(p);
1109 }
1110 
1111 /**********************************************************/
CpSeqId(InfoBioseqPtr ibp,const objects::CSeq_id & id)1112 void CpSeqId(InfoBioseqPtr ibp, const objects::CSeq_id& id)
1113 {
1114     const objects::CTextseq_id* text_id = id.GetTextseq_Id();
1115     if (text_id != nullptr)
1116     {
1117         if (text_id->IsSetName())
1118             ibp->locus = StringSave(text_id->GetName().c_str());
1119 
1120         CRef<objects::CSeq_id> new_id(new objects::CSeq_id);
1121         if (text_id->IsSetAccession())
1122         {
1123             ibp->acnum = StringSave(text_id->GetAccession().c_str());
1124 
1125             CRef<objects::CTextseq_id> new_text_id(new objects::CTextseq_id);
1126             new_text_id->SetAccession(text_id->GetAccession());
1127             if (text_id->IsSetVersion())
1128                 new_text_id->SetVersion(text_id->GetVersion());
1129 
1130             SetTextId(id.Which(), *new_id, *new_text_id);
1131         }
1132         else
1133         {
1134             new_id->Assign(id);
1135         }
1136 
1137         ibp->ids.push_back(new_id);
1138     }
1139     else {
1140         auto pId = Ref(new CSeq_id());
1141         pId->Assign(id);
1142         ibp->ids.push_back(move(pId));
1143     }
1144 }
1145 
1146 /**********************************************************/
InfoBioseqFree(InfoBioseqPtr ibp)1147 void InfoBioseqFree(InfoBioseqPtr ibp)
1148 {
1149     if (!ibp->ids.empty())
1150         ibp->ids.clear();
1151 
1152     if(ibp->locus != NULL)
1153     {
1154         MemFree(ibp->locus);
1155         ibp->locus = NULL;
1156     }
1157 
1158     if(ibp->acnum != NULL)
1159     {
1160         MemFree(ibp->acnum);
1161         ibp->acnum = NULL;
1162     }
1163 }
1164 
1165 /**********************************************************
1166     *
1167     *   CRef<objects::CDate_std> get_full_date(s, is_ref, source):
1168     *
1169     *      Get year, month, day and return CRef<objects::CDate_std>.
1170     *
1171     **********************************************************/
get_full_date(const Char * s,bool is_ref,Parser::ESource source)1172 CRef<objects::CDate_std> get_full_date(const Char* s, bool is_ref, Parser::ESource source)
1173 {
1174     static const char *months[] = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
1175         "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
1176 
1177     int               day = 0;
1178     int               month = 0;
1179     int               year;
1180     int               cal;
1181     Char           msg[11];
1182     const Char*        p;
1183 
1184     CRef<objects::CDate_std> date;
1185 
1186     if (s == NULL || *s == '\0')
1187         return date;
1188 
1189     if (IS_DIGIT(*s) != 0)
1190     {
1191         day = atoi(s);
1192         s += 3;
1193     }
1194 
1195     int num_of_months = sizeof(months) / sizeof(months[0]);
1196     for (cal = 0; cal < num_of_months; cal++)
1197     {
1198         if (StringNICmp(s, months[cal], 3) != 0)
1199             continue;
1200         month = cal + 1;
1201         break;
1202     }
1203 
1204     if (cal == num_of_months)
1205     {
1206         StringNCpy(msg, s, 10);
1207         msg[10] = '\0';
1208         if (is_ref)
1209             ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate,
1210             "Unrecognized month: %s", msg);
1211         else
1212             ErrPostEx(SEV_WARNING, ERR_DATE_IllegalDate,
1213             "Unrecognized month: %s", msg);
1214         return date;
1215     }
1216     p = s + 4;
1217 
1218     date = new objects::CDate_std;
1219     year = atoi(p);
1220     if ((StringNCmp(p, "19", 2) == 0 || StringNCmp(p, "20", 2) == 0 ||
1221          StringNCmp(p, "20", 2) == 0) &&
1222         p[2] >= '0' && p[2] <= '9' && p[3] >= '0' && p[3] <= '9')
1223     {
1224         CTime cur_time(CTime::eCurrent);
1225         objects::CDate_std cur(cur_time);
1226         objects::CDate_std::TYear cur_year = cur.GetYear();
1227 
1228         if (year < 1900 || year > cur_year)
1229         {
1230             if (is_ref)
1231                 ErrPostEx(SEV_ERROR, ERR_REFERENCE_IllegalDate,
1232                 "Illegal year: %d, current year: %d", year, cur_year);
1233             else
1234             {
1235                 if (source != Parser::ESource::SPROT || year - cur_year > 1)
1236                     ErrPostEx(SEV_WARNING, ERR_DATE_IllegalDate,
1237                     "Illegal year: %d, current year: %d", year, cur_year);
1238             }
1239         }
1240 
1241         date->SetYear(year);
1242     }
1243     else
1244     {
1245         if (year < 70)
1246             year += 2000;
1247         else
1248             year += 1900;
1249         date->SetYear(year);
1250     }
1251 
1252     date->SetMonth(month);
1253     date->SetDay(day);
1254 
1255     return date;
1256 }
1257 
1258 /**********************************************************
1259  *
1260  *   Int2 SrchKeyword(ptr, kwl):
1261  *
1262  *      Compare first kwl.len byte in ptr to kwl.str.
1263  *      Return the position of keyword block array;
1264  *   return unknow keyword, UNKW, if not found.
1265  *
1266  *                                              3-25-93
1267  *
1268  **********************************************************/
SrchKeyword(char * ptr,KwordBlk kwl[])1269 Int2 SrchKeyword(char* ptr, KwordBlk kwl[])
1270 {
1271     Int2 i;
1272 
1273     for(i = 0; kwl[i].str != NULL; i++)
1274         if(StringNCmp(ptr, kwl[i].str, kwl[i].len) == 0)
1275             break;
1276 
1277     if(kwl[i].str == NULL)
1278         return(ParFlat_UNKW);
1279     return(i);
1280 }
1281 
1282 /**********************************************************/
CheckLineType(char * ptr,Int4 line,KwordBlk kwl[],bool after_origin)1283 bool CheckLineType(char* ptr, Int4 line, KwordBlk kwl[], bool after_origin)
1284 {
1285     char* p;
1286     Char    msg[51];
1287     Int2    i;
1288 
1289     if(after_origin)
1290     {
1291         for(p = ptr; *p >= '0' && *p <= '9';)
1292             p++;
1293         if(*p == ' ')
1294             return true;
1295     }
1296 
1297     for(i = 0; kwl[i].str != NULL; i++)
1298         if(StringNCmp(ptr, kwl[i].str, kwl[i].len) == 0)
1299             break;
1300     if(kwl[i].str != NULL)
1301         return true;
1302 
1303     StringNCpy(msg, StringSave(ptr), 50);
1304     msg[50] = '\0';
1305     p = StringChr(msg, '\n');
1306     if(p != NULL)
1307         *p = '\0';
1308     ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
1309               "Unknown linetype \"%s\". Line number %d.", msg, line);
1310     if(p != NULL)
1311         *p = '\n';
1312 
1313     return false;
1314 }
1315 
1316 /**********************************************************
1317  *
1318  *   char* SrchNodeType(entry, type, len):
1319  *
1320  *      Return a memory location of the node which has
1321  *   the "type".
1322  *
1323  **********************************************************/
SrchNodeType(DataBlkPtr entry,Int4 type,size_t * len)1324 char* SrchNodeType(DataBlkPtr entry, Int4 type, size_t* len)
1325 {
1326     DataBlkPtr temp;
1327 
1328     temp = TrackNodeType(entry, (Int2) type);
1329     if(temp != NULL)
1330     {
1331         *len = temp->len;
1332         return(temp->offset);
1333     }
1334 
1335     *len = 0;
1336     return(NULL);
1337 }
1338 
1339 /**********************************************************
1340  *
1341  *   DataBlkPtr TrackNodeType(entry, type):
1342  *
1343  *      Return a pointer points to the Node which has
1344  *   the "type".
1345  *
1346  **********************************************************/
TrackNodeType(DataBlkPtr entry,Int2 type)1347 DataBlkPtr TrackNodeType(DataBlkPtr entry, Int2 type)
1348 {
1349     DataBlkPtr  temp;
1350     EntryBlkPtr ebp;
1351 
1352     ebp = (EntryBlkPtr) entry->data;
1353     temp = (DataBlkPtr) ebp->chain;
1354     while(temp != NULL && temp->type != type)
1355         temp = temp->next;
1356 
1357     return(temp);
1358 }
1359 
1360 /**********************************************************/
fta_tpa_keywords_check(const TKeywordList & kwds)1361 bool fta_tpa_keywords_check(const TKeywordList& kwds)
1362 {
1363     const char* b[4];
1364 
1365     bool kwd_tpa = false;
1366     bool kwd_party = false;
1367     bool kwd_inf = false;
1368     bool kwd_exp = false;
1369     bool kwd_asm = false;
1370     bool kwd_spedb = false;
1371     bool ret = true;
1372 
1373     Int4    j;
1374     Int2    i;
1375 
1376     if(kwds.empty())
1377         return true;
1378 
1379     size_t len = 0;
1380     j = 0;
1381     ITERATE(TKeywordList, key, kwds)
1382     {
1383         if(key->empty())
1384             continue;
1385 
1386         const char* p = key->c_str();
1387         i = MatchArrayIString(ParFlat_TPA_kw_array, p);
1388         if(i == 0)
1389             kwd_tpa = true;
1390         else if(i == 1 || i == 2)
1391             kwd_party = true;
1392         else if(i == 3)
1393             kwd_inf = true;
1394         else if(i == 4)
1395             kwd_exp = true;
1396         else if(i == 5 || i == 6)
1397             kwd_asm = true;
1398         else if(i == 7)
1399             kwd_spedb = true;
1400         else if (NStr::EqualNocase(p, 0, 3, "TPA"))
1401         {
1402             if(p[3] == ':')
1403             {
1404                 ErrPostEx(SEV_REJECT, ERR_KEYWORD_InvalidTPATier,
1405                           "Keyword \"%s\" is not a valid TPA-tier keyword.",
1406                           p);
1407                 ret = false;
1408             }
1409             else if(p[3] != '\0' && p[4] != '\0')
1410             {
1411                 ErrPostEx(SEV_WARNING, ERR_KEYWORD_UnexpectedTPA,
1412                           "Keyword \"%s\" looks like it might be TPA-related, but it is not a recognized TPA keyword.",
1413                           p);
1414             }
1415         }
1416         if(i > 2 && i < 8 && j < 4)
1417         {
1418             b[j] = p;
1419             ++j;
1420             len += key->size() + 1;
1421         }
1422     }
1423 
1424     if(kwd_tpa && !kwd_party)
1425     {
1426         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords,
1427                   "This TPA-record should have keyword \"Third Party Annotation\" or \"Third Party Data\" in addition to \"TPA\".");
1428         ret = false;
1429     }
1430     else if(!kwd_tpa && kwd_party)
1431     {
1432         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords,
1433                   "This TPA-record should have keyword \"TPA\" in addition to \"Third Party Annotation\" or \"Third Party Data\".");
1434         ret = false;
1435     }
1436     if(!kwd_tpa && (kwd_inf || kwd_exp))
1437     {
1438         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords,
1439                   "This TPA-record should have keyword \"TPA\" in addition to its TPA-tier keyword.");
1440         ret = false;
1441     }
1442     else if(kwd_tpa && kwd_inf == false && kwd_exp == false &&
1443             kwd_asm == false && kwd_spedb == false)
1444     {
1445         ErrPostEx(SEV_ERROR, ERR_KEYWORD_MissingTPATier,
1446                   "This TPA record lacks a keyword to indicate which tier it belongs to: experimental, inferential, reassembly or specialist_db.");
1447     }
1448     if(j > 1)
1449     {
1450         std::string buf;
1451         for(i = 0; i < j; i++)
1452         {
1453             if(i > 0)
1454                 buf += ';';
1455             buf += b[i];
1456         }
1457         ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingTPATiers,
1458                   "Keywords for multiple TPA tiers exist on this record: \"%s\". A TPA record can only be in one tier.",
1459                   buf.c_str());
1460         ret = false;
1461     }
1462 
1463     return(ret);
1464 }
1465 
1466 /**********************************************************/
fta_tsa_keywords_check(const TKeywordList & kwds,Parser::ESource source)1467 bool fta_tsa_keywords_check(const TKeywordList& kwds, Parser::ESource source)
1468 {
1469     bool kwd_tsa = false;
1470     bool kwd_assembly = false;
1471     bool ret = true;
1472     Int2 i;
1473 
1474     if(kwds.empty())
1475         return true;
1476 
1477     ITERATE(TKeywordList, key, kwds)
1478     {
1479         if(key->empty())
1480             continue;
1481         i = MatchArrayIString(ParFlat_TSA_kw_array, key->c_str());
1482         if(i == 0)
1483             kwd_tsa = true;
1484         else if(i == 1)
1485             kwd_assembly = true;
1486         else if(source == Parser::ESource::EMBL &&
1487                 NStr::EqualNocase(*key, "Transcript Shotgun Assembly"))
1488             kwd_assembly = true;
1489     }
1490 
1491     if(kwd_tsa && !kwd_assembly)
1492     {
1493         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords,
1494                   "This TSA-record should have keyword \"Transcriptome Shotgun Assembly\" in addition to \"TSA\".");
1495         ret = false;
1496     }
1497     else if(!kwd_tsa && kwd_assembly)
1498     {
1499         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords,
1500                   "This TSA-record should have keyword \"TSA\" in addition to \"Transcriptome Shotgun Assembly\".");
1501         ret = false;
1502     }
1503     return(ret);
1504 }
1505 
1506 /**********************************************************/
fta_tls_keywords_check(const TKeywordList & kwds,Parser::ESource source)1507 bool fta_tls_keywords_check(const TKeywordList& kwds, Parser::ESource source)
1508 {
1509     bool kwd_tls = false;
1510     bool kwd_study = false;
1511     bool ret = true;
1512     Int2 i;
1513 
1514     if(kwds.empty())
1515         return true;
1516 
1517     ITERATE(TKeywordList, key, kwds)
1518     {
1519         if(key->empty())
1520             continue;
1521         i = MatchArrayIString(ParFlat_TLS_kw_array, key->c_str());
1522         if(i == 0)
1523             kwd_tls = true;
1524         else if(i == 1)
1525             kwd_study = true;
1526         else if(source == Parser::ESource::EMBL &&
1527                 NStr::EqualNocase(*key, "Targeted Locus Study"))
1528             kwd_study = true;
1529     }
1530 
1531     if(kwd_tls && !kwd_study)
1532     {
1533         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords,
1534                   "This TLS-record should have keyword \"Targeted Locus Study\" in addition to \"TLS\".");
1535         ret = false;
1536     }
1537     else if(!kwd_tls && kwd_study)
1538     {
1539         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords,
1540                   "This TLS-record should have keyword \"TLS\" in addition to \"Targeted Locus Study\".");
1541         ret = false;
1542     }
1543     return(ret);
1544 }
1545 
1546 /**********************************************************/
fta_is_tpa_keyword(const char * str)1547 bool fta_is_tpa_keyword(const char* str)
1548 {
1549     if(str == NULL || *str == '\0' || MatchArrayIString(ParFlat_TPA_kw_array, str) < 0)
1550         return false;
1551 
1552     return true;
1553 }
1554 
1555 /**********************************************************/
fta_is_tsa_keyword(char * str)1556 bool fta_is_tsa_keyword(char* str)
1557 {
1558     if(str == NULL || *str == '\0' || MatchArrayIString(ParFlat_TSA_kw_array, str) < 0)
1559         return false;
1560     return true;
1561 }
1562 
1563 /**********************************************************/
fta_is_tls_keyword(char * str)1564 bool fta_is_tls_keyword(char* str)
1565 {
1566     if(str == NULL || *str == '\0' || MatchArrayIString(ParFlat_TLS_kw_array, str) < 0)
1567         return false;
1568     return true;
1569 }
1570 
1571 /**********************************************************/
fta_keywords_check(const char * str,bool * estk,bool * stsk,bool * gssk,bool * htck,bool * flik,bool * wgsk,bool * tpak,bool * envk,bool * mgak,bool * tsak,bool * tlsk)1572 void fta_keywords_check(const char* str, bool* estk, bool* stsk, bool* gssk,
1573                         bool* htck, bool* flik, bool* wgsk, bool* tpak,
1574                         bool* envk, bool* mgak, bool* tsak, bool* tlsk)
1575 {
1576     if(estk != NULL && MatchArrayString(ParFlat_EST_kw_array, str) != -1)
1577         *estk = true;
1578 
1579     if(stsk != NULL && MatchArrayString(ParFlat_STS_kw_array, str) != -1)
1580         *stsk = true;
1581 
1582     if(gssk != NULL && MatchArrayString(ParFlat_GSS_kw_array, str) != -1)
1583         *gssk = true;
1584 
1585     if(htck != NULL && MatchArrayString(ParFlat_HTC_kw_array, str) != -1)
1586         *htck = true;
1587 
1588     if(flik != NULL && MatchArrayString(ParFlat_FLI_kw_array, str) != -1)
1589         *flik = true;
1590 
1591     if(wgsk != NULL && MatchArrayString(ParFlat_WGS_kw_array, str) != -1)
1592         *wgsk = true;
1593 
1594     if(tpak != NULL && MatchArrayString(ParFlat_TPA_kw_array, str) != -1)
1595         *tpak = true;
1596 
1597     if(envk != NULL && MatchArrayString(ParFlat_ENV_kw_array, str) != -1)
1598         *envk = true;
1599 
1600     if(mgak != NULL && MatchArrayString(ParFlat_MGA_kw_array, str) != -1)
1601         *mgak = true;
1602 
1603     if(tsak != NULL && MatchArrayString(ParFlat_TSA_kw_array, str) != -1)
1604         *tsak = true;
1605 
1606     if(tlsk != NULL && MatchArrayString(ParFlat_TLS_kw_array, str) != -1)
1607         *tlsk = true;
1608 }
1609 
1610 /**********************************************************/
fta_remove_keywords(Uint1 tech,TKeywordList & kwds)1611 void fta_remove_keywords(Uint1 tech, TKeywordList& kwds)
1612 {
1613     const char **b;
1614 
1615     if(kwds.empty())
1616         return;
1617 
1618     if (tech == objects::CMolInfo::eTech_est)
1619         b = ParFlat_EST_kw_array;
1620     else if (tech == objects::CMolInfo::eTech_sts)
1621         b = ParFlat_STS_kw_array;
1622     else if (tech == objects::CMolInfo::eTech_survey)
1623         b = ParFlat_GSS_kw_array;
1624     else if (tech == objects::CMolInfo::eTech_htc)
1625         b = ParFlat_HTC_kw_array;
1626     else if (tech == objects::CMolInfo::eTech_fli_cdna)
1627         b = ParFlat_FLI_kw_array;
1628     else if (tech == objects::CMolInfo::eTech_wgs)
1629         b = ParFlat_WGS_kw_array;
1630     else
1631         return;
1632 
1633     for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1634     {
1635         if (key->empty() || MatchArrayString(b, key->c_str()) != -1)
1636         {
1637             key = kwds.erase(key);
1638         }
1639         else
1640             ++key;
1641     }
1642 }
1643 
1644 /**********************************************************/
fta_remove_tpa_keywords(TKeywordList & kwds)1645 void fta_remove_tpa_keywords(TKeywordList& kwds)
1646 {
1647     if (kwds.empty())
1648         return;
1649 
1650     for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1651     {
1652         if (key->empty() || MatchArrayIString(ParFlat_TPA_kw_array_to_remove, key->c_str()) != -1)
1653         {
1654             key = kwds.erase(key);
1655         }
1656         else
1657             ++key;
1658     }
1659 }
1660 
1661 /**********************************************************/
fta_remove_tsa_keywords(TKeywordList & kwds,Parser::ESource source)1662 void fta_remove_tsa_keywords(TKeywordList& kwds, Parser::ESource source)
1663 {
1664     if (kwds.empty())
1665         return;
1666 
1667     for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1668     {
1669         if (key->empty() || MatchArrayIString(ParFlat_TSA_kw_array, key->c_str()) != -1 ||
1670             (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Transcript Shotgun Assembly")))
1671         {
1672             key = kwds.erase(key);
1673         }
1674         else
1675             ++key;
1676     }
1677 }
1678 
1679 /**********************************************************/
fta_remove_tls_keywords(TKeywordList & kwds,Parser::ESource source)1680 void fta_remove_tls_keywords(TKeywordList& kwds, Parser::ESource source)
1681 {
1682     if (kwds.empty())
1683         return;
1684 
1685     for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1686     {
1687         if (key->empty() || MatchArrayIString(ParFlat_TLS_kw_array, key->c_str()) != -1 ||
1688             (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Targeted Locus Study")))
1689         {
1690             key = kwds.erase(key);
1691         }
1692         else
1693             ++key;
1694     }
1695 }
1696 
1697 /**********************************************************/
fta_remove_env_keywords(TKeywordList & kwds)1698 void fta_remove_env_keywords(TKeywordList& kwds)
1699 {
1700     if (kwds.empty())
1701         return;
1702 
1703     for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1704     {
1705         if (key->empty() || MatchArrayIString(ParFlat_ENV_kw_array, key->c_str()) != -1)
1706         {
1707             key = kwds.erase(key);
1708         }
1709         else
1710             ++key;
1711     }
1712 }
1713 
1714 /**********************************************************/
check_est_sts_gss_tpa_kwds(ValNodePtr kwds,size_t len,IndexblkPtr entry,bool tpa_check,bool & specialist_db,bool & inferential,bool & experimental,bool & assembly)1715 void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry,
1716                                 bool tpa_check, bool &specialist_db,
1717                                 bool &inferential, bool &experimental,
1718                                 bool &assembly)
1719 {
1720     char* line;
1721     char* p;
1722     char* q;
1723 
1724     if(kwds == NULL || kwds->data.ptrvalue == NULL || len < 1)
1725         return;
1726 
1727     line = (char*) MemNew(len + 1);
1728     line[0] = '\0';
1729     for(; kwds != NULL; kwds = kwds->next)
1730     {
1731         StringCat(line, (const char *) kwds->data.ptrvalue);
1732     }
1733     for(p = line; *p != '\0'; p++)
1734         if(*p == '\n' || *p == '\t')
1735             *p = ' ';
1736     for(p = line; *p == ' ' || *p == '.' || *p == ';';)
1737         p++;
1738     if(*p == '\0')
1739     {
1740         MemFree(line);
1741         return;
1742     }
1743     for(q = p; *q != '\0';)
1744         q++;
1745     for(q--; *q == ' ' || *q == '.' || *q == ';'; q--)
1746         *q = '\0';
1747     for(q = p, p = line; *q != '\0';)
1748     {
1749         if(*q != ' ' && *q != ';')
1750         {
1751             *p++ = *q++;
1752             continue;
1753         }
1754         if(*q == ' ')
1755         {
1756             for(q++; *q == ' ';)
1757                 q++;
1758             if(*q != ';')
1759                 *p++ = ' ';
1760         }
1761         if(*q == ';')
1762         {
1763             *p++ = *q++;
1764             while(*q == ' ' || *q == ';')
1765                 q++;
1766         }
1767     }
1768     *p++ = ';';
1769     *p = '\0';
1770     for(p = line;; p = q + 1)
1771     {
1772         q = StringChr(p, ';');
1773         if(q == NULL)
1774             break;
1775         *q = '\0';
1776         fta_keywords_check(p, &entry->EST, &entry->STS, &entry->GSS,
1777                            &entry->HTC, NULL, NULL,
1778                            (tpa_check ? &entry->is_tpa : NULL),
1779                            NULL, NULL, NULL, NULL);
1780         if(NStr::EqualNocase(p, "TPA:specialist_db") ||
1781            NStr::EqualNocase(p, "TPA:assembly"))
1782         {
1783             specialist_db = true;
1784             if(NStr::EqualNocase(p, "TPA:assembly"))
1785                 assembly = true;
1786         }
1787         else if(NStr::EqualNocase(p, "TPA:inferential"))
1788             inferential = true;
1789         else if(NStr::EqualNocase(p, "TPA:experimental"))
1790             experimental = true;
1791     }
1792     MemFree(line);
1793 }
1794 
1795 /**********************************************************/
fta_operon_free(FTAOperonPtr fop)1796 void fta_operon_free(FTAOperonPtr fop)
1797 {
1798     FTAOperonPtr fopnext;
1799 
1800     for(; fop != NULL; fop = fopnext)
1801     {
1802         fopnext = fop->next;
1803         if(fop->strloc != NULL)
1804             MemFree(fop->strloc);
1805         delete fop;
1806     }
1807 }
1808 
1809 /**********************************************************/
ConstructValNode(ValNodePtr head,Uint1 choice,void * data)1810 ValNodePtr ConstructValNode(ValNodePtr head, Uint1 choice, void* data)
1811 {
1812     ValNodePtr res;
1813 
1814     res = ValNodeNew(head);
1815     res->choice = choice;
1816     res->data.ptrvalue = data;
1817     res->next = NULL;
1818     return(res);
1819 }
1820 
1821 /**********************************************************/
ConstructValNodeInt(ValNodePtr head,Uint1 choice,Int4 data)1822 ValNodePtr ConstructValNodeInt(ValNodePtr head, Uint1 choice, Int4 data)
1823 {
1824     ValNodePtr res;
1825 
1826     res = ValNodeNew(head);
1827     res->choice = choice;
1828     res->data.intvalue = data;
1829     res->next = NULL;
1830     return(res);
1831 }
1832 
1833 /**********************************************************/
fta_check_mga_keywords(objects::CMolInfo & mol_info,const TKeywordList & kwds)1834 bool fta_check_mga_keywords(objects::CMolInfo& mol_info, const TKeywordList& kwds)
1835 {
1836     bool is_cage;
1837     bool is_sage;
1838 
1839     TKeywordList::const_iterator key_it = kwds.end();
1840 
1841     bool got = false;
1842     if (!kwds.empty() && NStr::EqualNocase(kwds.front(), "MGA"))
1843     {
1844         ITERATE(TKeywordList, key, kwds)
1845         {
1846             if(MatchArrayIString(ParFlat_MGA_more_kw_array,
1847                                  key->c_str()) < 0)
1848                 continue;
1849             got = true;
1850             key_it = key;
1851             break;
1852         }
1853     }
1854 
1855     if(!got)
1856     {
1857         ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingMGAKeywords,
1858                   "This is apparently a CAGE record, but it lacks the required keywords. Entry dropped.");
1859         return false;
1860     }
1861 
1862     if (!mol_info.IsSetTechexp() || !kwds.empty() ||
1863         mol_info.GetTechexp() != "cage")
1864         return true;
1865 
1866     for (is_sage = false, is_cage = false; key_it != kwds.end(); ++key_it)
1867     {
1868         const char* p = key_it->c_str();
1869 
1870         if (NStr::EqualNocase(p, "5'-SAGE"))
1871             is_sage = true;
1872         else if (NStr::EqualNocase(p, "CAGE (Cap Analysis Gene Expression)"))
1873             is_cage = true;
1874     }
1875 
1876     if(is_sage)
1877     {
1878         if(is_cage)
1879         {
1880             ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingMGAKeywords,
1881                       "This MGA record contains more than one of the special keywords indicating different techniques.");
1882             return false;
1883         }
1884         mol_info.SetTechexp("5'-sage");
1885     }
1886 
1887     return true;
1888 }
1889 
/**********************************************************
 *
 *   void fta_StringCpy(dst, src):
 *
 *      Copies the NUL-terminated string "src", terminator
 *   included, to "dst", one character at a time from the
 *   front.  Because of that copy order "dst" may point
 *   into the same buffer at or before "src", e.g. to shift
 *   the tail of a string to the left.
 *
 **********************************************************/
void fta_StringCpy(char* dst, char* src)
{
    /* the assignment copies the terminator too, then the loop stops */
    while ((*dst++ = *src++) != '\0')
        ;
}
1900 
1901 /**********************************************************/
SetTextId(Uint1 seqtype,objects::CSeq_id & seqId,objects::CTextseq_id & textId)1902 bool SetTextId(Uint1 seqtype, objects::CSeq_id& seqId, objects::CTextseq_id& textId)
1903 {
1904     bool wasSet = true;
1905 
1906     switch (seqtype)
1907     {
1908     case objects::CSeq_id::e_Genbank:
1909         seqId.SetGenbank(textId);
1910         break;
1911     case objects::CSeq_id::e_Embl:
1912         seqId.SetEmbl(textId);
1913         break;
1914     case objects::CSeq_id::e_Pir:
1915         seqId.SetPir(textId);
1916         break;
1917     case objects::CSeq_id::e_Swissprot:
1918         seqId.SetSwissprot(textId);
1919         break;
1920     case objects::CSeq_id::e_Other:
1921         seqId.SetOther(textId);
1922         break;
1923     case objects::CSeq_id::e_Ddbj:
1924         seqId.SetDdbj(textId);
1925         break;
1926     case objects::CSeq_id::e_Prf:
1927         seqId.SetPrf(textId);
1928         break;
1929     case objects::CSeq_id::e_Pdb:
1930     {
1931         // TODO: test this branch
1932         objects::CPDB_seq_id pdbId;
1933         pdbId.SetChain_id(0);
1934         seqId.SetPdb(pdbId);
1935     }
1936     break;
1937     case objects::CSeq_id::e_Tpg:
1938         seqId.SetTpg(textId);
1939         break;
1940     case objects::CSeq_id::e_Tpe:
1941         seqId.SetTpe(textId);
1942         break;
1943     case objects::CSeq_id::e_Tpd:
1944         seqId.SetTpd(textId);
1945         break;
1946     case objects::CSeq_id::e_Gpipe:
1947         seqId.SetGpipe(textId);
1948         break;
1949     case objects::CSeq_id::e_Named_annot_track:
1950         seqId.SetNamed_annot_track(textId);
1951         break;
1952 
1953     default:
1954         wasSet = false;
1955     }
1956 
1957     return wasSet;
1958 }
1959 
1960 /**********************************************************/
IsCancelled(const TKeywordList & keywords)1961 bool IsCancelled(const TKeywordList& keywords)
1962 {
1963     ITERATE(TKeywordList, key, keywords)
1964     {
1965         if (NStr::EqualNocase(*key, "HTGS_CANCELLED"))
1966             return true;
1967     }
1968 
1969     return false;
1970 }
1971 
1972 /**********************************************************/
HasHtg(const TKeywordList & keywords)1973 bool HasHtg(const TKeywordList& keywords)
1974 {
1975     ITERATE(TKeywordList, key, keywords)
1976     {
1977         if (*key == "HTG" || *key == "HTGS_PHASE0" ||
1978             *key == "HTGS_PHASE1" || *key == "HTGS_PHASE2" ||
1979             *key == "HTGS_PHASE3")
1980         {
1981             return true;
1982         }
1983     }
1984 
1985     return false;
1986 }
1987 
1988 /**********************************************************/
RemoveHtgPhase(TKeywordList & keywords)1989 void RemoveHtgPhase(TKeywordList& keywords)
1990 {
1991     for (TKeywordList::iterator key = keywords.begin(); key != keywords.end();)
1992     {
1993         const char* p = key->c_str();
1994         if (NStr::EqualNocase(p, 0, 10, "HTGS_PHASE") &&
1995             (p[10] == '0' || p[10] == '1' || p[10] == '2' ||
1996             p[10] == '3') && p[11] == '\0')
1997         {
1998             key = keywords.erase(key);
1999         }
2000         else
2001             ++key;
2002     }
2003 }
2004 
2005 /**********************************************************/
HasHtc(const TKeywordList & keywords)2006 bool HasHtc(const TKeywordList& keywords)
2007 {
2008     ITERATE(TKeywordList, key, keywords)
2009     {
2010         if (NStr::EqualNocase(*key, "HTC"))
2011         {
2012             return true;
2013         }
2014     }
2015 
2016     return false;
2017 }
2018 
2019 END_NCBI_SCOPE
2020