1 /* utilfun.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: utilfun.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Utility functions for parser and indexing.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39
40 #include <corelib/ncbistr.hpp>
41 #include <objmgr/scope.hpp>
42 #include <objmgr/object_manager.hpp>
43 #include <objects/seq/MolInfo.hpp>
44 #include <objects/seqloc/PDB_seq_id.hpp>
45 #include <corelib/tempstr.hpp>
46
47 #include "index.h"
48
49 #include "ftaerr.hpp"
50 #include "indx_def.h"
51 #include "utilfun.h"
52
53 #ifdef THIS_FILE
54 # undef THIS_FILE
55 #endif
56 #define THIS_FILE "utilfun.cpp"
57
58 BEGIN_NCBI_SCOPE;
59
60 USING_SCOPE(objects);
61
GetScope()62 CScope& GetScope()
63 {
64 static CScope scope(*CObjectManager::GetInstance());
65 return scope;
66 }
67
68
/* Keyword tables.  Every table is a list of C strings closed by a
 * null pointer so that it can be walked without a separate length.
 */

/* EST keyword spellings.
 * NOTE(review): the second entry looks like a mangled variant of
 * "EST (expressed sequence tag)"; it is kept verbatim because these
 * strings are compared against input at run time.
 */
static const char* ParFlat_EST_kw_array[] = {
    "EST",
    "EST PROTO((expressed sequence tag)",
    "expressed sequence tag",
    "EST (expressed sequence tag)",
    "EST (expressed sequence tags)",
    "EST(expressed sequence tag)",
    "transcribed sequence fragment",
    nullptr
};

/* GSS keyword spellings. */
static const char* ParFlat_GSS_kw_array[] = {
    "GSS",
    "GSS (genome survey sequence)",
    "trapped exon",
    nullptr
};

/* STS keyword spellings. */
static const char* ParFlat_STS_kw_array[] = {
    "STS",
    "STS(sequence tagged site)",
    "STS (sequence tagged site)",
    "STS sequence",
    "sequence tagged site",
    nullptr
};

/* HTC keyword. */
static const char* ParFlat_HTC_kw_array[] = {
    "HTC",
    nullptr
};

/* FLI_CDNA keyword. */
static const char* ParFlat_FLI_kw_array[] = {
    "FLI_CDNA",
    nullptr
};

/* WGS keyword. */
static const char* ParFlat_WGS_kw_array[] = {
    "WGS",
    nullptr
};

/* MGA keyword spellings. */
static const char* ParFlat_MGA_kw_array[] = {
    "MGA",
    "CAGE (Cap Analysis Gene Expression)",
    "5'-SAGE",
    nullptr
};

/* Additional MGA-related keyword spellings. */
static const char* ParFlat_MGA_more_kw_array[] = {
    "CAGE (Cap Analysis Gene Expression)",
    "5'-SAGE",
    "5'-end tag",
    "unspecified tag",
    "small RNA",
    nullptr
};

/* Changing the contents of the next array requires matching
 * modifications in function fta_tsa_keywords_check().
 */
static const char* ParFlat_TSA_kw_array[] = {
    "TSA",
    "Transcriptome Shotgun Assembly",
    nullptr
};

/* Changing the contents of the next array requires matching
 * modifications in function fta_tls_keywords_check().
 */
static const char* ParFlat_TLS_kw_array[] = {
    "TLS",
    "Targeted Locus Study",
    nullptr
};

/* Changing the contents of the next 2 arrays requires matching
 * modifications in function fta_tpa_keywords_check().
 */
static const char* ParFlat_TPA_kw_array[] = {
    "TPA",
    "THIRD PARTY ANNOTATION",
    "THIRD PARTY DATA",
    "TPA:INFERENTIAL",
    "TPA:EXPERIMENTAL",
    "TPA:REASSEMBLY",
    "TPA:ASSEMBLY",
    "TPA:SPECIALIST_DB",
    nullptr
};

static const char* ParFlat_TPA_kw_array_to_remove[] = {
    "TPA",
    "THIRD PARTY ANNOTATION",
    "THIRD PARTY DATA",
    nullptr
};

/* ENV keyword. */
static const char* ParFlat_ENV_kw_array[] = {
    "ENV",
    nullptr
};
171
172 /**********************************************************/
FTAitoa(Int4 m)173 static std::string FTAitoa(Int4 m)
174 {
175 Int4 sign = (m < 0) ? -1 : 1;
176 std::string res;
177
178 for(m *= sign; m > 9; m /= 10)
179 res += m % 10 + '0';
180
181 res += m + '0';
182
183 if(sign < 0)
184 res += '-';
185
186 std::reverse(res.begin(), res.end());
187 return res;
188 }
189
190 /**********************************************************/
UnwrapAccessionRange(const objects::CGB_block::TExtra_accessions & extra_accs,objects::CGB_block::TExtra_accessions & hist)191 void UnwrapAccessionRange(const objects::CGB_block::TExtra_accessions& extra_accs, objects::CGB_block::TExtra_accessions& hist)
192 {
193 Int4 num1;
194 Int4 num2;
195
196 objects::CGB_block::TExtra_accessions ret;
197
198 ITERATE(objects::CGB_block::TExtra_accessions, acc, extra_accs)
199 {
200 std::string str = *acc;
201 if (str.empty())
202 continue;
203
204 size_t dash = str.find('-');
205 if (dash == std::string::npos)
206 {
207 ret.push_back(str);
208 continue;
209 }
210
211 std::string first(str.begin(), str.begin() + dash),
212 last(str.begin() + dash + 1, str.end());
213 size_t acclen = first.size();
214
215 const Char* p = first.c_str();
216 for (; (*p >= 'A' && *p <= 'Z') || *p == '_';)
217 p++;
218
219 size_t preflen = p - first.c_str();
220
221 std::string prefix = first.substr(0, preflen);
222 while(*p == '0')
223 p++;
224
225 const Char* q = p;
226 for (q = p; *p >= '0' && *p <= '9';)
227 p++;
228 num1 = atoi(q);
229
230 for (p = last.c_str() + preflen; *p == '0';)
231 p++;
232 for(q = p; *p >= '0' && *p <= '9';)
233 p++;
234 num2 = atoi(q);
235
236 ret.push_back(first);
237
238 if(num1 == num2)
239 continue;
240
241 for (num1++; num1 <= num2; num1++)
242 {
243 std::string new_acc = prefix;
244
245 std::string num_str = FTAitoa(num1);
246 size_t j = acclen - preflen - num_str.size();
247
248 for(size_t i = 0; i < j; i++)
249 new_acc += '0';
250
251 new_acc += num_str;
252 ret.push_back(new_acc);
253 }
254 }
255
256 ret.swap(hist);
257 }
258
/* True for the characters accepted in an accession prefix:
 * upper-case 'A'..'Z' and the underscore.
 */
static bool sIsPrefixChar(char c)
{
    if (c == '_')
        return true;

    return c >= 'A' && c <= 'Z';
}
262 /**********************************************************/
ParseAccessionRange(list<string> & tokens,int skip)263 bool ParseAccessionRange(list<string>& tokens, int skip)
264 {
265 bool bad = false;
266
267 if (tokens.empty()) {
268 return true;
269 }
270
271 if (tokens.size() <= skip+1) {
272 return true;
273 }
274
275
276
277 auto it = tokens.begin();
278 if (skip) {
279 advance(it, skip);
280 }
281
282 for (; it != tokens.end(); ++it) {
283 const auto& token = *it;
284 if (token.empty()) {
285 continue;
286 }
287
288 CTempString first, last;
289 if (!NStr::SplitInTwo(token, "-", first, last)) {
290 continue;
291 }
292 if (first.size() != last.size()) {
293 bad = true;
294 break;
295 }
296
297 auto first_it =
298 find_if_not(begin(first), end(first), sIsPrefixChar);
299
300 if (first_it == first.end()) {
301 bad = true;
302 break;
303 }
304
305
306 auto last_it =
307 find_if_not(begin(last), end(last), sIsPrefixChar);
308 if (last_it == last.end()) {
309 bad = true;
310 break;
311 }
312
313 auto prefixLength = distance(first.begin(), first_it);
314 if (prefixLength != distance(last.begin(), last_it) ||
315 !NStr::EqualCase(first, 0, prefixLength, last.substr(0, prefixLength))) {
316 ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch,
317 "Inconsistent prefix found in secondary accession range \"%s\".",
318 token.c_str());
319 break;
320 }
321
322 auto num1 = NStr::StringToInt(first.substr(prefixLength));
323 auto num2 = NStr::StringToInt(last.substr(prefixLength));
324
325 if (num2 <= num1) {
326 ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
327 "Invalid start/end values in secondary accession range \"%s\".",
328 token.c_str());
329 }
330
331 *it = first;
332 it = tokens.insert(it, "-");
333 it = tokens.insert(it, last);
334 }
335
336
337 /*
338 for(bad = false; tbp != NULL; tbp = tbpnext)
339 {
340 tbpnext = tbp->next;
341 if(tbp->str == NULL)
342 continue;
343 dash = StringChr(tbp->str, '-');
344 if(dash == NULL)
345 continue;
346 *dash = '\0';
347 first = tbp->str;
348 last = dash + 1;
349 if(StringLen(first) != StringLen(last) || *first < 'A' ||
350 *first > 'Z' || *last < 'A' || *last > 'Z')
351 {
352 *dash = '-';
353 bad = true;
354 break;
355 }
356
357 for(p = first; (*p >= 'A' && *p <= 'Z') || *p == '_';)
358 p++;
359 if(*p < '0' || *p > '9')
360 {
361 *dash = '-';
362 bad = true;
363 break;
364 }
365 for(q = last; (*q >= 'A' && *q <= 'Z') || *q == '_';)
366 q++;
367 if(*q < '0' || *q > '9')
368 {
369 *dash = '-';
370 bad = true;
371 break;
372 }
373 size_t preflen = p - first;
374 if(preflen != (size_t) (q - last) || StringNCmp(first, last, preflen) != 0)
375 {
376 *dash = '-';
377 ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch,
378 "Inconsistent prefix found in secondary accession range \"%s\".",
379 tbp->str);
380 break;
381 }
382
383
384 while(*p == '0') // ignore all the zeros
385 p++;
386 for(q = p; *p >= '0' && *p <= '9';)
387 p++;
388 if(*p != '\0')
389 {
390 *dash = '-';
391 bad = true;
392 break;
393 }
394 num1 = atoi(q); // the first number
395
396 for(p = last + preflen; *p == '0';)
397 p++;
398 for(q = p; *p >= '0' && *p <= '9';)
399 p++;
400 if(*p != '\0')
401 {
402 *dash = '-';
403 bad = true;
404 break;
405 }
406 num2 = atoi(q);
407
408 if(num1 > num2)
409 {
410 *dash = '-';
411 ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
412 "Invalid start/end values in secondary accession range \"%s\".",
413 tbp->str);
414 break;
415 }
416
417 tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
418 tbp = tbp->next;
419 tbp->str = StringSave("-");
420 tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
421 tbp = tbp->next;
422 tbp->str = StringSave(last);
423 tsbp->num += 2;
424
425 tbp->next = tbpnext;
426 }
427 if(tbp == NULL)
428 return true;
429 */
430 if(bad)
431 {
432 ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
433 "Incorrect secondary accession range provided: \"%s\".",
434 it->c_str());
435 }
436 return false;
437 }
438
439 /**********************************************************/
ParseAccessionRange(TokenStatBlkPtr tsbp,Int4 skip)440 bool ParseAccessionRange(TokenStatBlkPtr tsbp, Int4 skip)
441 {
442 TokenBlkPtr tbp;
443 TokenBlkPtr tbpnext;
444 char* dash;
445 char* first;
446 char* last;
447 char* p;
448 char* q;
449 bool bad;
450 Int4 num1;
451 Int4 num2;
452
453 if(tsbp->list == NULL)
454 return true;
455
456 tbp = NULL;
457 if(skip == 0)
458 tbp = tsbp->list;
459 else if(skip == 1)
460 {
461 if(tsbp->list != NULL)
462 tbp = tsbp->list->next;
463 }
464 else
465 {
466 if(tsbp->list != NULL && tsbp->list->next != NULL)
467 tbp = tsbp->list->next->next;
468 }
469 if(tbp == NULL)
470 return true;
471
472 for(bad = false; tbp != NULL; tbp = tbpnext)
473 {
474 tbpnext = tbp->next;
475 if(tbp->str == NULL)
476 continue;
477 dash = StringChr(tbp->str, '-');
478 if(dash == NULL)
479 continue;
480 *dash = '\0';
481 first = tbp->str;
482 last = dash + 1;
483 if(StringLen(first) != StringLen(last) || *first < 'A' ||
484 *first > 'Z' || *last < 'A' || *last > 'Z')
485 {
486 *dash = '-';
487 bad = true;
488 break;
489 }
490
491 for(p = first; (*p >= 'A' && *p <= 'Z') || *p == '_';)
492 p++;
493 if(*p < '0' || *p > '9')
494 {
495 *dash = '-';
496 bad = true;
497 break;
498 }
499 for(q = last; (*q >= 'A' && *q <= 'Z') || *q == '_';)
500 q++;
501 if(*q < '0' || *q > '9')
502 {
503 *dash = '-';
504 bad = true;
505 break;
506 }
507 size_t preflen = p - first;
508 if(preflen != (size_t) (q - last) || StringNCmp(first, last, preflen) != 0)
509 {
510 *dash = '-';
511 ErrPostEx(SEV_REJECT, ERR_ACCESSION_2ndAccPrefixMismatch,
512 "Inconsistent prefix found in secondary accession range \"%s\".",
513 tbp->str);
514 break;
515 }
516
517 while(*p == '0')
518 p++;
519 for(q = p; *p >= '0' && *p <= '9';)
520 p++;
521 if(*p != '\0')
522 {
523 *dash = '-';
524 bad = true;
525 break;
526 }
527 num1 = atoi(q);
528
529 for(p = last + preflen; *p == '0';)
530 p++;
531 for(q = p; *p >= '0' && *p <= '9';)
532 p++;
533 if(*p != '\0')
534 {
535 *dash = '-';
536 bad = true;
537 break;
538 }
539 num2 = atoi(q);
540
541 if(num1 > num2)
542 {
543 *dash = '-';
544 ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
545 "Invalid start/end values in secondary accession range \"%s\".",
546 tbp->str);
547 break;
548 }
549
550 tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
551 tbp = tbp->next;
552 tbp->str = StringSave("-");
553 tbp->next = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
554 tbp = tbp->next;
555 tbp->str = StringSave(last);
556 tsbp->num += 2;
557
558 tbp->next = tbpnext;
559 }
560 if(tbp == NULL)
561 return true;
562 if(bad)
563 {
564 ErrPostEx(SEV_REJECT, ERR_ACCESSION_Invalid2ndAccRange,
565 "Incorrect secondary accession range provided: \"%s\".",
566 tbp->str);
567 }
568 return false;
569 }
570
571 /**********************************************************/
TokenNodeNew(TokenBlkPtr tbp)572 static TokenBlkPtr TokenNodeNew(TokenBlkPtr tbp)
573 {
574 TokenBlkPtr newnode = (TokenBlkPtr) MemNew(sizeof(TokenBlk));
575
576 if(tbp != NULL)
577 {
578 while(tbp->next != NULL)
579 tbp = tbp->next;
580 tbp->next = newnode;
581 }
582
583 return(newnode);
584 }
585
586 /**********************************************************/
InsertTokenVal(TokenBlkPtr * tbp,char * str)587 static void InsertTokenVal(TokenBlkPtr* tbp, char* str)
588 {
589 TokenBlkPtr ltbp;
590
591 ltbp = *tbp;
592 ltbp = TokenNodeNew(ltbp);
593 ltbp->str = StringSave(str);
594
595 if(*tbp == NULL)
596 *tbp = ltbp;
597 }
598
599 /**********************************************************
600 *
601 * TokenStatBlkPtr TokenString(str, delimiter):
602 *
603 * Parsing string "str" by delimiter or tab key, blank.
604 * Parsing stop at newline ('\n') or end of string ('\0').
605 * Return a statistics of link list token.
606 *
607 **********************************************************/
TokenString(char * str,Char delimiter)608 TokenStatBlkPtr TokenString(char* str, Char delimiter)
609 {
610 char* bptr;
611 char* ptr;
612 char* curtoken;
613 Int2 num;
614 TokenStatBlkPtr token;
615 Char ch;
616
617 token = (TokenStatBlkPtr) MemNew(sizeof(TokenStatBlk));
618
619 /* skip first several delimiters if any existed
620 */
621 for(ptr = str; *ptr == delimiter;)
622 ptr++;
623
624 for(num = 0; *ptr != '\0' && *ptr != '\n';)
625 {
626 for(bptr = ptr; *ptr != delimiter && *ptr != '\n' &&
627 *ptr != '\t' && *ptr != ' ' && *ptr != '\0';)
628 ptr++;
629
630 ch = *ptr;
631 *ptr = '\0';
632 curtoken = StringSave(bptr);
633 *ptr = ch;
634
635 InsertTokenVal(&token->list, curtoken);
636 num++;
637 MemFree(curtoken);
638
639 while(*ptr == delimiter || *ptr == '\t' || *ptr == ' ')
640 ptr++;
641 }
642
643 token->num = num;
644
645 return(token);
646 }
647
648 /**********************************************************
649 *
650 * TokenStatBlkPtr TokenStringByDelimiter(str, delimiter):
651 *
652 * Parsing string "str" by delimiter.
653 * Parsing stop at end of string ('\0').
654 * Return a statistics of link list token.
655 *
656 **********************************************************/
TokenStringByDelimiter(char * str,Char delimiter)657 TokenStatBlkPtr TokenStringByDelimiter(char* str, Char delimiter)
658 {
659 char* bptr;
660 char* ptr;
661 char* curtoken;
662 char* s;
663 Int2 num;
664 TokenStatBlkPtr token;
665 Char ch;
666
667 token = (TokenStatBlkPtr) MemNew(sizeof(TokenStatBlk));
668
669 /* skip first several delimiters if any existed
670 */
671 for(ptr = str; *ptr == delimiter;)
672 ptr++;
673
674 /* remove '.' from the end of the string
675 */
676 s = ptr + StringLen(ptr) - 1;
677 if(*s == '.')
678 *s = '\0';
679
680 for(num = 0; *ptr != '\0';)
681 {
682 for(bptr = ptr; *ptr != delimiter && *ptr != '\0';)
683 ptr++;
684
685 ch = *ptr;
686 *ptr = '\0';
687 curtoken = StringSave(bptr);
688 *ptr = ch;
689
690 InsertTokenVal(&token->list, curtoken);
691 num++;
692 MemFree(curtoken);
693
694 while(*ptr == delimiter || *ptr == ' ')
695 ptr++;
696 }
697
698 token->num = num;
699
700 return(token);
701 }
702
703 /**********************************************************/
FreeTokenblk(TokenBlkPtr tbp)704 void FreeTokenblk(TokenBlkPtr tbp)
705 {
706 TokenBlkPtr temp;
707
708 while(tbp != NULL)
709 {
710 temp = tbp;
711 tbp = tbp->next;
712 MemFree(temp->str);
713 MemFree(temp);
714 }
715 }
716
717 /**********************************************************/
FreeTokenstatblk(TokenStatBlkPtr tsbp)718 void FreeTokenstatblk(TokenStatBlkPtr tsbp)
719 {
720 FreeTokenblk(tsbp->list);
721 MemFree(tsbp);
722 }
723
724 /**********************************************************
725 *
726 * Int2 fta_StringMatch(array, text):
727 *
728 * Return array position of the matched length
729 * of string in array.
730 * Return -1 if no match.
731 *
732 **********************************************************/
fta_StringMatch(const Char ** array,const Char * text)733 Int2 fta_StringMatch(const Char **array, const Char* text)
734 {
735 Int2 i;
736
737 if(text == NULL)
738 return(-1);
739
740 for (i = 0; *array != NULL; i++, array++)
741 {
742 if (NStr::EqualCase(text, 0, StringLen(*array), *array))
743 break;
744 }
745
746 if(*array == NULL)
747 return(-1);
748
749 return(i);
750 }
751
752 /**********************************************************
753 *
754 * Int2 StringMatchIcase(array, text):
755 *
756 * Return array position of the matched lenght of
757 * string (ignored case) in array.
758 * Return -1 if no match.
759 *
760 **********************************************************/
StringMatchIcase(const Char ** array,const Char * text)761 Int2 StringMatchIcase(const Char **array, const Char* text)
762 {
763 Int2 i;
764
765 if(text == NULL)
766 return(-1);
767
768 for (i = 0; *array != NULL; i++, array++)
769 {
770 // If string from an array is empty its length == 0 and would be equval to any other string
771 // The next 'if' statement will avoid that behavior
772 if (text[0] != 0 && *array[0] == 0)
773 continue;
774
775 if (NStr::EqualNocase(text, 0, StringLen(*array), *array))
776 break;
777 }
778
779 if(*array == NULL)
780 return(-1);
781 return(i);
782 }
783
784 /**********************************************************
785 *
786 * Int2 MatchArrayString(array, text):
787 *
788 * Return array position of the string in the
789 * array.
790 * Return -1 if no match.
791 *
792 **********************************************************/
MatchArrayString(const char ** array,const char * text)793 Int2 MatchArrayString(const char **array, const char *text)
794 {
795 Int2 i;
796
797 if(text == NULL)
798 return(-1);
799
800 for (i = 0; *array != NULL; i++, array++)
801 {
802 if (NStr::Equal(*array, text))
803 break;
804 }
805
806 if(*array == NULL)
807 return(-1);
808 return(i);
809 }
810
811 /**********************************************************/
MatchArrayIString(const Char ** array,const Char * text)812 Int2 MatchArrayIString(const Char **array, const Char *text)
813 {
814 Int2 i;
815
816 if(text == NULL)
817 return(-1);
818
819 for (i = 0; *array != NULL; i++, array++)
820 {
821 // If string from an array is empty its length == 0 and would be equval to any other string
822 // The next 'if' statement will avoid that behavior
823 if (text[0] != 0 && *array[0] == 0)
824 continue;
825
826 if (NStr::EqualNocase(*array, text))
827 break;
828 }
829
830 if(*array == NULL)
831 return(-1);
832 return(i);
833 }
834
835 /**********************************************************
836 *
837 * Int2 MatchArraySubString(array, text):
838 *
839 * Return array position of the string in the array
840 * if any array is in the substring of "text".
841 * Return -1 if no match.
842 *
843 **********************************************************/
MatchArraySubString(const Char ** array,const Char * text)844 Int2 MatchArraySubString(const Char **array, const Char* text)
845 {
846 Int2 i;
847
848 if(text == NULL)
849 return(-1);
850
851 for (i = 0; *array != NULL; i++, array++)
852 {
853 if (NStr::Find(text, *array) != NPOS)
854 break;
855 }
856
857 if(*array == NULL)
858 return(-1);
859 return(i);
860 }
861
862 /**********************************************************/
/* Find the first occurrence of "what" inside "where", treating the
 * letters 'A'..'Z' and 'a'..'z' as equal to their other-case form.
 *
 * Returns a pointer into "where" at the start of the match, or NULL
 * when either argument is NULL or empty, or when there is no match.
 */
Char* StringIStr(const Char* where, const Char *what)
{
    if (where == NULL || *where == '\0' || what == NULL || *what == '\0')
        return NULL;

    /* Compare one pattern character with one text character. */
    auto sameLetter = [](Char pat, Char txt) -> bool {
        if (pat == txt)
            return true;
        if (pat >= 'A' && pat <= 'Z')
            return pat + 32 == txt;
        if (pat >= 'a' && pat <= 'z')
            return pat - 32 == txt;
        return false;
    };

    for (const Char* start = where; *start != '\0'; start++)
    {
        const Char* pat = what;
        const Char* txt = start;

        while (*pat != '\0' && *txt != '\0' && sameLetter(*pat, *txt))
        {
            pat++;
            txt++;
        }

        /* Whole pattern consumed: match found at "start". */
        if (*pat == '\0')
            return const_cast<char*>(start);

        /* Text ran out first: no later start can succeed either. */
        if (*txt == '\0')
            return NULL;
    }

    return NULL;
}
898
899 /**********************************************************/
MatchArrayISubString(const Char ** array,const Char * text)900 Int2 MatchArrayISubString(const Char **array, const Char* text)
901 {
902 Int2 i;
903
904 if(text == NULL)
905 return(-1);
906
907 for (i = 0; *array != NULL; i++, array++)
908 {
909 if (NStr::FindNoCase(text, *array) != NPOS)
910 break;
911 }
912
913 if(*array == NULL)
914 return(-1);
915 return(i);
916 }
917
918 /**********************************************************
919 *
920 * char* GetBlkDataReplaceNewLine(bptr, eptr,
921 * start_col_data):
922 *
923 * Return a string which replace newline to blank
924 * and skip "XX" line data.
925 *
926 **********************************************************/
GetBlkDataReplaceNewLine(char * bptr,char * eptr,Int2 start_col_data)927 char* GetBlkDataReplaceNewLine(char* bptr, char* eptr,
928 Int2 start_col_data)
929 {
930 char* ptr;
931
932 if(bptr + start_col_data >= eptr)
933 return(NULL);
934
935 size_t size = eptr - bptr;
936 char* retstr = (char*)MemNew(size + 1);
937 char* str = retstr;
938
939 while(bptr < eptr)
940 {
941 if (NStr::Equal(bptr, 0, 2, "XX")) /* skip XX line data */
942 {
943 ptr = SrchTheChar(bptr, eptr, '\n');
944 bptr = ptr + 1;
945 continue;
946 }
947
948 bptr += start_col_data;
949 ptr = SrchTheChar(bptr, eptr, '\n');
950
951 if(ptr != NULL)
952 {
953 size = ptr - bptr;
954 MemCpy(str, bptr, size);
955 str += size;
956 if(*(ptr - 1) != '-' || *(ptr - 2) == ' ')
957 {
958 StringCpy(str, " ");
959 str++;
960 }
961 bptr = ptr;
962 }
963 bptr++;
964 }
965
966 std::string tstr = NStr::TruncateSpaces(std::string(retstr), NStr::eTrunc_End);
967 MemFree(retstr);
968 retstr = StringSave(tstr.c_str());
969
970 return(retstr);
971 }
972
973
974 /**********************************************************/
/* Return the length "str" would have once every trailing ' ', '\n',
 * '\\', ',', ';', '~', '.' and ':' character is dropped.
 *
 * str - text to examine (may be NULL).
 * len - number of characters of str to consider.
 *
 * Returns 0 when str is NULL, len is 0, or all "len" characters belong
 * to the set above.  The scan counts down with an explicit "> 0" test:
 * an unsigned index compared with ">= 0" never ends and walks off the
 * front of the buffer.
 */
static size_t SeekLastAlphaChar(const char* str, size_t len)
{
    if (str == nullptr)
        return 0;

    auto isTailNoise = [](char c) -> bool {
        switch (c)
        {
            case ' ':
            case '\n':
            case '\\':
            case ',':
            case ';':
            case '~':
            case '.':
            case ':':
                return true;
            default:
                return false;
        }
    };

    size_t keep = len;
    while (keep > 0 && isTailNoise(str[keep - 1]))
        --keep;

    return keep;
}
996
997 /**********************************************************/
CleanTailNoneAlphaCharInString(std::string & str)998 void CleanTailNoneAlphaCharInString(std::string& str)
999 {
1000 size_t ret = SeekLastAlphaChar(str.c_str(), str.size());
1001 str = str.substr(0, ret);
1002 }
1003
1004 /**********************************************************
1005 *
1006 * void CleanTailNoneAlphaChar(str):
1007 *
1008 * Delete any tailing ' ', '\n', '\\', ',', ';', '~',
1009 * '.', ':' characters.
1010 *
1011 **********************************************************/
CleanTailNoneAlphaChar(char * str)1012 void CleanTailNoneAlphaChar(char* str)
1013 {
1014 if(str == NULL || *str == '\0')
1015 return;
1016
1017 size_t last = SeekLastAlphaChar(str, strlen(str));
1018 str[last] = '\0';
1019 }
1020
1021 /**********************************************************/
/* Advance past the current blank-delimited token and the blanks that
 * follow it.
 *
 * ptr - position inside a NUL-terminated string (may be NULL).
 *
 * Returns a pointer to the first character of the next token, or to
 * the terminating NUL when no further token exists; NULL is returned
 * unchanged.  The scan stops at the terminator, so a string without
 * any blank is never overrun.
 */
char* PointToNextToken(char* ptr)
{
    if (ptr == nullptr)
        return ptr;

    while (*ptr != '\0' && *ptr != ' ')
        ptr++;
    while (*ptr == ' ')
        ptr++;

    return ptr;
}
1033
1034 /**********************************************************
1035 *
1036 * char* GetTheCurrentToken(ptr):
1037 *
1038 * Return the current token (also CleanTailNoneAlphaChar)
1039 * which ptr points to and ptr will points to next token
1040 * after the routine return.
1041 *
1042 **********************************************************/
GetTheCurrentToken(char ** ptr)1043 char* GetTheCurrentToken(char** ptr)
1044 {
1045 char* retptr;
1046 char* bptr;
1047 char* str;
1048 Char ch;
1049
1050 bptr = retptr = *ptr;
1051 if(retptr == NULL || *retptr == '\0')
1052 return(NULL);
1053
1054 while(*retptr != '\0' && *retptr != ' ')
1055 retptr++;
1056
1057 ch = *retptr;
1058 *retptr = '\0';
1059 str = StringSave(bptr);
1060 *retptr = ch;
1061
1062 while(*retptr != '\0' && *retptr == ' ') /* skip blanks */
1063 retptr++;
1064 *ptr = retptr;
1065
1066 CleanTailNoneAlphaChar(str);
1067 return(str);
1068 }
1069
1070 /**********************************************************
1071 *
1072 * char* SrchTheChar(bptr, eptr, letter):
1073 *
1074 * Search The character letter.
1075 * Return NULL if not found; otherwise, return
1076 * a pointer points first occurrence The character.
1077 *
1078 **********************************************************/
SrchTheChar(char * bptr,char * eptr,Char letter)1079 char* SrchTheChar(char* bptr, char* eptr, Char letter)
1080 {
1081 while(bptr < eptr && *bptr != letter)
1082 bptr++;
1083
1084 if(bptr < eptr)
1085 return(bptr);
1086
1087 return(NULL);
1088 }
1089
1090 /**********************************************************
1091 *
1092 * char* SrchTheStr(bptr, eptr, leadstr):
1093 *
1094 * Search The leading string.
1095 * Return NULL if not found; otherwise, return
1096 * a pointer points first occurrence The leading string.
1097 *
1098 **********************************************************/
SrchTheStr(char * bptr,char * eptr,const char * leadstr)1099 char* SrchTheStr(char* bptr, char* eptr, const char *leadstr)
1100 {
1101 char* p;
1102 Char c;
1103
1104 c = *eptr;
1105 *eptr = '\0';
1106 p = StringStr(bptr, leadstr);
1107 *eptr = c;
1108 return(p);
1109 }
1110
1111 /**********************************************************/
CpSeqId(InfoBioseqPtr ibp,const objects::CSeq_id & id)1112 void CpSeqId(InfoBioseqPtr ibp, const objects::CSeq_id& id)
1113 {
1114 const objects::CTextseq_id* text_id = id.GetTextseq_Id();
1115 if (text_id != nullptr)
1116 {
1117 if (text_id->IsSetName())
1118 ibp->locus = StringSave(text_id->GetName().c_str());
1119
1120 CRef<objects::CSeq_id> new_id(new objects::CSeq_id);
1121 if (text_id->IsSetAccession())
1122 {
1123 ibp->acnum = StringSave(text_id->GetAccession().c_str());
1124
1125 CRef<objects::CTextseq_id> new_text_id(new objects::CTextseq_id);
1126 new_text_id->SetAccession(text_id->GetAccession());
1127 if (text_id->IsSetVersion())
1128 new_text_id->SetVersion(text_id->GetVersion());
1129
1130 SetTextId(id.Which(), *new_id, *new_text_id);
1131 }
1132 else
1133 {
1134 new_id->Assign(id);
1135 }
1136
1137 ibp->ids.push_back(new_id);
1138 }
1139 else {
1140 auto pId = Ref(new CSeq_id());
1141 pId->Assign(id);
1142 ibp->ids.push_back(move(pId));
1143 }
1144 }
1145
1146 /**********************************************************/
InfoBioseqFree(InfoBioseqPtr ibp)1147 void InfoBioseqFree(InfoBioseqPtr ibp)
1148 {
1149 if (!ibp->ids.empty())
1150 ibp->ids.clear();
1151
1152 if(ibp->locus != NULL)
1153 {
1154 MemFree(ibp->locus);
1155 ibp->locus = NULL;
1156 }
1157
1158 if(ibp->acnum != NULL)
1159 {
1160 MemFree(ibp->acnum);
1161 ibp->acnum = NULL;
1162 }
1163 }
1164
1165 /**********************************************************
1166 *
1167 * CRef<objects::CDate_std> get_full_date(s, is_ref, source):
1168 *
1169 * Get year, month, day and return CRef<objects::CDate_std>.
1170 *
1171 **********************************************************/
get_full_date(const Char * s,bool is_ref,Parser::ESource source)1172 CRef<objects::CDate_std> get_full_date(const Char* s, bool is_ref, Parser::ESource source)
1173 {
1174 static const char *months[] = { "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
1175 "JUL", "AUG", "SEP", "OCT", "NOV", "DEC" };
1176
1177 int day = 0;
1178 int month = 0;
1179 int year;
1180 int cal;
1181 Char msg[11];
1182 const Char* p;
1183
1184 CRef<objects::CDate_std> date;
1185
1186 if (s == NULL || *s == '\0')
1187 return date;
1188
1189 if (IS_DIGIT(*s) != 0)
1190 {
1191 day = atoi(s);
1192 s += 3;
1193 }
1194
1195 int num_of_months = sizeof(months) / sizeof(months[0]);
1196 for (cal = 0; cal < num_of_months; cal++)
1197 {
1198 if (StringNICmp(s, months[cal], 3) != 0)
1199 continue;
1200 month = cal + 1;
1201 break;
1202 }
1203
1204 if (cal == num_of_months)
1205 {
1206 StringNCpy(msg, s, 10);
1207 msg[10] = '\0';
1208 if (is_ref)
1209 ErrPostEx(SEV_WARNING, ERR_REFERENCE_IllegalDate,
1210 "Unrecognized month: %s", msg);
1211 else
1212 ErrPostEx(SEV_WARNING, ERR_DATE_IllegalDate,
1213 "Unrecognized month: %s", msg);
1214 return date;
1215 }
1216 p = s + 4;
1217
1218 date = new objects::CDate_std;
1219 year = atoi(p);
1220 if ((StringNCmp(p, "19", 2) == 0 || StringNCmp(p, "20", 2) == 0 ||
1221 StringNCmp(p, "20", 2) == 0) &&
1222 p[2] >= '0' && p[2] <= '9' && p[3] >= '0' && p[3] <= '9')
1223 {
1224 CTime cur_time(CTime::eCurrent);
1225 objects::CDate_std cur(cur_time);
1226 objects::CDate_std::TYear cur_year = cur.GetYear();
1227
1228 if (year < 1900 || year > cur_year)
1229 {
1230 if (is_ref)
1231 ErrPostEx(SEV_ERROR, ERR_REFERENCE_IllegalDate,
1232 "Illegal year: %d, current year: %d", year, cur_year);
1233 else
1234 {
1235 if (source != Parser::ESource::SPROT || year - cur_year > 1)
1236 ErrPostEx(SEV_WARNING, ERR_DATE_IllegalDate,
1237 "Illegal year: %d, current year: %d", year, cur_year);
1238 }
1239 }
1240
1241 date->SetYear(year);
1242 }
1243 else
1244 {
1245 if (year < 70)
1246 year += 2000;
1247 else
1248 year += 1900;
1249 date->SetYear(year);
1250 }
1251
1252 date->SetMonth(month);
1253 date->SetDay(day);
1254
1255 return date;
1256 }
1257
1258 /**********************************************************
1259 *
1260 * Int2 SrchKeyword(ptr, kwl):
1261 *
1262 * Compare first kwl.len byte in ptr to kwl.str.
1263 * Return the position of keyword block array;
1264 * return unknow keyword, UNKW, if not found.
1265 *
1266 * 3-25-93
1267 *
1268 **********************************************************/
SrchKeyword(char * ptr,KwordBlk kwl[])1269 Int2 SrchKeyword(char* ptr, KwordBlk kwl[])
1270 {
1271 Int2 i;
1272
1273 for(i = 0; kwl[i].str != NULL; i++)
1274 if(StringNCmp(ptr, kwl[i].str, kwl[i].len) == 0)
1275 break;
1276
1277 if(kwl[i].str == NULL)
1278 return(ParFlat_UNKW);
1279 return(i);
1280 }
1281
1282 /**********************************************************/
CheckLineType(char * ptr,Int4 line,KwordBlk kwl[],bool after_origin)1283 bool CheckLineType(char* ptr, Int4 line, KwordBlk kwl[], bool after_origin)
1284 {
1285 char* p;
1286 Char msg[51];
1287 Int2 i;
1288
1289 if(after_origin)
1290 {
1291 for(p = ptr; *p >= '0' && *p <= '9';)
1292 p++;
1293 if(*p == ' ')
1294 return true;
1295 }
1296
1297 for(i = 0; kwl[i].str != NULL; i++)
1298 if(StringNCmp(ptr, kwl[i].str, kwl[i].len) == 0)
1299 break;
1300 if(kwl[i].str != NULL)
1301 return true;
1302
1303 StringNCpy(msg, StringSave(ptr), 50);
1304 msg[50] = '\0';
1305 p = StringChr(msg, '\n');
1306 if(p != NULL)
1307 *p = '\0';
1308 ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
1309 "Unknown linetype \"%s\". Line number %d.", msg, line);
1310 if(p != NULL)
1311 *p = '\n';
1312
1313 return false;
1314 }
1315
1316 /**********************************************************
1317 *
1318 * char* SrchNodeType(entry, type, len):
1319 *
1320 * Return a memory location of the node which has
1321 * the "type".
1322 *
1323 **********************************************************/
SrchNodeType(DataBlkPtr entry,Int4 type,size_t * len)1324 char* SrchNodeType(DataBlkPtr entry, Int4 type, size_t* len)
1325 {
1326 DataBlkPtr temp;
1327
1328 temp = TrackNodeType(entry, (Int2) type);
1329 if(temp != NULL)
1330 {
1331 *len = temp->len;
1332 return(temp->offset);
1333 }
1334
1335 *len = 0;
1336 return(NULL);
1337 }
1338
1339 /**********************************************************
1340 *
1341 * DataBlkPtr TrackNodeType(entry, type):
1342 *
 *      Return a pointer to the node which has
 *   the "type".
1345 *
1346 **********************************************************/
TrackNodeType(DataBlkPtr entry,Int2 type)1347 DataBlkPtr TrackNodeType(DataBlkPtr entry, Int2 type)
1348 {
1349 DataBlkPtr temp;
1350 EntryBlkPtr ebp;
1351
1352 ebp = (EntryBlkPtr) entry->data;
1353 temp = (DataBlkPtr) ebp->chain;
1354 while(temp != NULL && temp->type != type)
1355 temp = temp->next;
1356
1357 return(temp);
1358 }
1359
1360 /**********************************************************/
fta_tpa_keywords_check(const TKeywordList & kwds)1361 bool fta_tpa_keywords_check(const TKeywordList& kwds)
1362 {
1363 const char* b[4];
1364
1365 bool kwd_tpa = false;
1366 bool kwd_party = false;
1367 bool kwd_inf = false;
1368 bool kwd_exp = false;
1369 bool kwd_asm = false;
1370 bool kwd_spedb = false;
1371 bool ret = true;
1372
1373 Int4 j;
1374 Int2 i;
1375
1376 if(kwds.empty())
1377 return true;
1378
1379 size_t len = 0;
1380 j = 0;
1381 ITERATE(TKeywordList, key, kwds)
1382 {
1383 if(key->empty())
1384 continue;
1385
1386 const char* p = key->c_str();
1387 i = MatchArrayIString(ParFlat_TPA_kw_array, p);
1388 if(i == 0)
1389 kwd_tpa = true;
1390 else if(i == 1 || i == 2)
1391 kwd_party = true;
1392 else if(i == 3)
1393 kwd_inf = true;
1394 else if(i == 4)
1395 kwd_exp = true;
1396 else if(i == 5 || i == 6)
1397 kwd_asm = true;
1398 else if(i == 7)
1399 kwd_spedb = true;
1400 else if (NStr::EqualNocase(p, 0, 3, "TPA"))
1401 {
1402 if(p[3] == ':')
1403 {
1404 ErrPostEx(SEV_REJECT, ERR_KEYWORD_InvalidTPATier,
1405 "Keyword \"%s\" is not a valid TPA-tier keyword.",
1406 p);
1407 ret = false;
1408 }
1409 else if(p[3] != '\0' && p[4] != '\0')
1410 {
1411 ErrPostEx(SEV_WARNING, ERR_KEYWORD_UnexpectedTPA,
1412 "Keyword \"%s\" looks like it might be TPA-related, but it is not a recognized TPA keyword.",
1413 p);
1414 }
1415 }
1416 if(i > 2 && i < 8 && j < 4)
1417 {
1418 b[j] = p;
1419 ++j;
1420 len += key->size() + 1;
1421 }
1422 }
1423
1424 if(kwd_tpa && !kwd_party)
1425 {
1426 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords,
1427 "This TPA-record should have keyword \"Third Party Annotation\" or \"Third Party Data\" in addition to \"TPA\".");
1428 ret = false;
1429 }
1430 else if(!kwd_tpa && kwd_party)
1431 {
1432 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords,
1433 "This TPA-record should have keyword \"TPA\" in addition to \"Third Party Annotation\" or \"Third Party Data\".");
1434 ret = false;
1435 }
1436 if(!kwd_tpa && (kwd_inf || kwd_exp))
1437 {
1438 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTPAKeywords,
1439 "This TPA-record should have keyword \"TPA\" in addition to its TPA-tier keyword.");
1440 ret = false;
1441 }
1442 else if(kwd_tpa && kwd_inf == false && kwd_exp == false &&
1443 kwd_asm == false && kwd_spedb == false)
1444 {
1445 ErrPostEx(SEV_ERROR, ERR_KEYWORD_MissingTPATier,
1446 "This TPA record lacks a keyword to indicate which tier it belongs to: experimental, inferential, reassembly or specialist_db.");
1447 }
1448 if(j > 1)
1449 {
1450 std::string buf;
1451 for(i = 0; i < j; i++)
1452 {
1453 if(i > 0)
1454 buf += ';';
1455 buf += b[i];
1456 }
1457 ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingTPATiers,
1458 "Keywords for multiple TPA tiers exist on this record: \"%s\". A TPA record can only be in one tier.",
1459 buf.c_str());
1460 ret = false;
1461 }
1462
1463 return(ret);
1464 }
1465
1466 /**********************************************************/
fta_tsa_keywords_check(const TKeywordList & kwds,Parser::ESource source)1467 bool fta_tsa_keywords_check(const TKeywordList& kwds, Parser::ESource source)
1468 {
1469 bool kwd_tsa = false;
1470 bool kwd_assembly = false;
1471 bool ret = true;
1472 Int2 i;
1473
1474 if(kwds.empty())
1475 return true;
1476
1477 ITERATE(TKeywordList, key, kwds)
1478 {
1479 if(key->empty())
1480 continue;
1481 i = MatchArrayIString(ParFlat_TSA_kw_array, key->c_str());
1482 if(i == 0)
1483 kwd_tsa = true;
1484 else if(i == 1)
1485 kwd_assembly = true;
1486 else if(source == Parser::ESource::EMBL &&
1487 NStr::EqualNocase(*key, "Transcript Shotgun Assembly"))
1488 kwd_assembly = true;
1489 }
1490
1491 if(kwd_tsa && !kwd_assembly)
1492 {
1493 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords,
1494 "This TSA-record should have keyword \"Transcriptome Shotgun Assembly\" in addition to \"TSA\".");
1495 ret = false;
1496 }
1497 else if(!kwd_tsa && kwd_assembly)
1498 {
1499 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTSAKeywords,
1500 "This TSA-record should have keyword \"TSA\" in addition to \"Transcriptome Shotgun Assembly\".");
1501 ret = false;
1502 }
1503 return(ret);
1504 }
1505
1506 /**********************************************************/
fta_tls_keywords_check(const TKeywordList & kwds,Parser::ESource source)1507 bool fta_tls_keywords_check(const TKeywordList& kwds, Parser::ESource source)
1508 {
1509 bool kwd_tls = false;
1510 bool kwd_study = false;
1511 bool ret = true;
1512 Int2 i;
1513
1514 if(kwds.empty())
1515 return true;
1516
1517 ITERATE(TKeywordList, key, kwds)
1518 {
1519 if(key->empty())
1520 continue;
1521 i = MatchArrayIString(ParFlat_TLS_kw_array, key->c_str());
1522 if(i == 0)
1523 kwd_tls = true;
1524 else if(i == 1)
1525 kwd_study = true;
1526 else if(source == Parser::ESource::EMBL &&
1527 NStr::EqualNocase(*key, "Targeted Locus Study"))
1528 kwd_study = true;
1529 }
1530
1531 if(kwd_tls && !kwd_study)
1532 {
1533 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords,
1534 "This TLS-record should have keyword \"Targeted Locus Study\" in addition to \"TLS\".");
1535 ret = false;
1536 }
1537 else if(!kwd_tls && kwd_study)
1538 {
1539 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingTLSKeywords,
1540 "This TLS-record should have keyword \"TLS\" in addition to \"Targeted Locus Study\".");
1541 ret = false;
1542 }
1543 return(ret);
1544 }
1545
1546 /**********************************************************/
fta_is_tpa_keyword(const char * str)1547 bool fta_is_tpa_keyword(const char* str)
1548 {
1549 if(str == NULL || *str == '\0' || MatchArrayIString(ParFlat_TPA_kw_array, str) < 0)
1550 return false;
1551
1552 return true;
1553 }
1554
1555 /**********************************************************/
fta_is_tsa_keyword(char * str)1556 bool fta_is_tsa_keyword(char* str)
1557 {
1558 if(str == NULL || *str == '\0' || MatchArrayIString(ParFlat_TSA_kw_array, str) < 0)
1559 return false;
1560 return true;
1561 }
1562
1563 /**********************************************************/
fta_is_tls_keyword(char * str)1564 bool fta_is_tls_keyword(char* str)
1565 {
1566 if(str == NULL || *str == '\0' || MatchArrayIString(ParFlat_TLS_kw_array, str) < 0)
1567 return false;
1568 return true;
1569 }
1570
1571 /**********************************************************/
fta_keywords_check(const char * str,bool * estk,bool * stsk,bool * gssk,bool * htck,bool * flik,bool * wgsk,bool * tpak,bool * envk,bool * mgak,bool * tsak,bool * tlsk)1572 void fta_keywords_check(const char* str, bool* estk, bool* stsk, bool* gssk,
1573 bool* htck, bool* flik, bool* wgsk, bool* tpak,
1574 bool* envk, bool* mgak, bool* tsak, bool* tlsk)
1575 {
1576 if(estk != NULL && MatchArrayString(ParFlat_EST_kw_array, str) != -1)
1577 *estk = true;
1578
1579 if(stsk != NULL && MatchArrayString(ParFlat_STS_kw_array, str) != -1)
1580 *stsk = true;
1581
1582 if(gssk != NULL && MatchArrayString(ParFlat_GSS_kw_array, str) != -1)
1583 *gssk = true;
1584
1585 if(htck != NULL && MatchArrayString(ParFlat_HTC_kw_array, str) != -1)
1586 *htck = true;
1587
1588 if(flik != NULL && MatchArrayString(ParFlat_FLI_kw_array, str) != -1)
1589 *flik = true;
1590
1591 if(wgsk != NULL && MatchArrayString(ParFlat_WGS_kw_array, str) != -1)
1592 *wgsk = true;
1593
1594 if(tpak != NULL && MatchArrayString(ParFlat_TPA_kw_array, str) != -1)
1595 *tpak = true;
1596
1597 if(envk != NULL && MatchArrayString(ParFlat_ENV_kw_array, str) != -1)
1598 *envk = true;
1599
1600 if(mgak != NULL && MatchArrayString(ParFlat_MGA_kw_array, str) != -1)
1601 *mgak = true;
1602
1603 if(tsak != NULL && MatchArrayString(ParFlat_TSA_kw_array, str) != -1)
1604 *tsak = true;
1605
1606 if(tlsk != NULL && MatchArrayString(ParFlat_TLS_kw_array, str) != -1)
1607 *tlsk = true;
1608 }
1609
1610 /**********************************************************/
fta_remove_keywords(Uint1 tech,TKeywordList & kwds)1611 void fta_remove_keywords(Uint1 tech, TKeywordList& kwds)
1612 {
1613 const char **b;
1614
1615 if(kwds.empty())
1616 return;
1617
1618 if (tech == objects::CMolInfo::eTech_est)
1619 b = ParFlat_EST_kw_array;
1620 else if (tech == objects::CMolInfo::eTech_sts)
1621 b = ParFlat_STS_kw_array;
1622 else if (tech == objects::CMolInfo::eTech_survey)
1623 b = ParFlat_GSS_kw_array;
1624 else if (tech == objects::CMolInfo::eTech_htc)
1625 b = ParFlat_HTC_kw_array;
1626 else if (tech == objects::CMolInfo::eTech_fli_cdna)
1627 b = ParFlat_FLI_kw_array;
1628 else if (tech == objects::CMolInfo::eTech_wgs)
1629 b = ParFlat_WGS_kw_array;
1630 else
1631 return;
1632
1633 for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1634 {
1635 if (key->empty() || MatchArrayString(b, key->c_str()) != -1)
1636 {
1637 key = kwds.erase(key);
1638 }
1639 else
1640 ++key;
1641 }
1642 }
1643
1644 /**********************************************************/
fta_remove_tpa_keywords(TKeywordList & kwds)1645 void fta_remove_tpa_keywords(TKeywordList& kwds)
1646 {
1647 if (kwds.empty())
1648 return;
1649
1650 for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1651 {
1652 if (key->empty() || MatchArrayIString(ParFlat_TPA_kw_array_to_remove, key->c_str()) != -1)
1653 {
1654 key = kwds.erase(key);
1655 }
1656 else
1657 ++key;
1658 }
1659 }
1660
1661 /**********************************************************/
fta_remove_tsa_keywords(TKeywordList & kwds,Parser::ESource source)1662 void fta_remove_tsa_keywords(TKeywordList& kwds, Parser::ESource source)
1663 {
1664 if (kwds.empty())
1665 return;
1666
1667 for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1668 {
1669 if (key->empty() || MatchArrayIString(ParFlat_TSA_kw_array, key->c_str()) != -1 ||
1670 (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Transcript Shotgun Assembly")))
1671 {
1672 key = kwds.erase(key);
1673 }
1674 else
1675 ++key;
1676 }
1677 }
1678
1679 /**********************************************************/
fta_remove_tls_keywords(TKeywordList & kwds,Parser::ESource source)1680 void fta_remove_tls_keywords(TKeywordList& kwds, Parser::ESource source)
1681 {
1682 if (kwds.empty())
1683 return;
1684
1685 for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1686 {
1687 if (key->empty() || MatchArrayIString(ParFlat_TLS_kw_array, key->c_str()) != -1 ||
1688 (source == Parser::ESource::EMBL && NStr::EqualNocase(*key, "Targeted Locus Study")))
1689 {
1690 key = kwds.erase(key);
1691 }
1692 else
1693 ++key;
1694 }
1695 }
1696
1697 /**********************************************************/
fta_remove_env_keywords(TKeywordList & kwds)1698 void fta_remove_env_keywords(TKeywordList& kwds)
1699 {
1700 if (kwds.empty())
1701 return;
1702
1703 for (TKeywordList::iterator key = kwds.begin(); key != kwds.end();)
1704 {
1705 if (key->empty() || MatchArrayIString(ParFlat_ENV_kw_array, key->c_str()) != -1)
1706 {
1707 key = kwds.erase(key);
1708 }
1709 else
1710 ++key;
1711 }
1712 }
1713
1714 /**********************************************************/
check_est_sts_gss_tpa_kwds(ValNodePtr kwds,size_t len,IndexblkPtr entry,bool tpa_check,bool & specialist_db,bool & inferential,bool & experimental,bool & assembly)1715 void check_est_sts_gss_tpa_kwds(ValNodePtr kwds, size_t len, IndexblkPtr entry,
1716 bool tpa_check, bool &specialist_db,
1717 bool &inferential, bool &experimental,
1718 bool &assembly)
1719 {
1720 char* line;
1721 char* p;
1722 char* q;
1723
1724 if(kwds == NULL || kwds->data.ptrvalue == NULL || len < 1)
1725 return;
1726
1727 line = (char*) MemNew(len + 1);
1728 line[0] = '\0';
1729 for(; kwds != NULL; kwds = kwds->next)
1730 {
1731 StringCat(line, (const char *) kwds->data.ptrvalue);
1732 }
1733 for(p = line; *p != '\0'; p++)
1734 if(*p == '\n' || *p == '\t')
1735 *p = ' ';
1736 for(p = line; *p == ' ' || *p == '.' || *p == ';';)
1737 p++;
1738 if(*p == '\0')
1739 {
1740 MemFree(line);
1741 return;
1742 }
1743 for(q = p; *q != '\0';)
1744 q++;
1745 for(q--; *q == ' ' || *q == '.' || *q == ';'; q--)
1746 *q = '\0';
1747 for(q = p, p = line; *q != '\0';)
1748 {
1749 if(*q != ' ' && *q != ';')
1750 {
1751 *p++ = *q++;
1752 continue;
1753 }
1754 if(*q == ' ')
1755 {
1756 for(q++; *q == ' ';)
1757 q++;
1758 if(*q != ';')
1759 *p++ = ' ';
1760 }
1761 if(*q == ';')
1762 {
1763 *p++ = *q++;
1764 while(*q == ' ' || *q == ';')
1765 q++;
1766 }
1767 }
1768 *p++ = ';';
1769 *p = '\0';
1770 for(p = line;; p = q + 1)
1771 {
1772 q = StringChr(p, ';');
1773 if(q == NULL)
1774 break;
1775 *q = '\0';
1776 fta_keywords_check(p, &entry->EST, &entry->STS, &entry->GSS,
1777 &entry->HTC, NULL, NULL,
1778 (tpa_check ? &entry->is_tpa : NULL),
1779 NULL, NULL, NULL, NULL);
1780 if(NStr::EqualNocase(p, "TPA:specialist_db") ||
1781 NStr::EqualNocase(p, "TPA:assembly"))
1782 {
1783 specialist_db = true;
1784 if(NStr::EqualNocase(p, "TPA:assembly"))
1785 assembly = true;
1786 }
1787 else if(NStr::EqualNocase(p, "TPA:inferential"))
1788 inferential = true;
1789 else if(NStr::EqualNocase(p, "TPA:experimental"))
1790 experimental = true;
1791 }
1792 MemFree(line);
1793 }
1794
1795 /**********************************************************/
fta_operon_free(FTAOperonPtr fop)1796 void fta_operon_free(FTAOperonPtr fop)
1797 {
1798 FTAOperonPtr fopnext;
1799
1800 for(; fop != NULL; fop = fopnext)
1801 {
1802 fopnext = fop->next;
1803 if(fop->strloc != NULL)
1804 MemFree(fop->strloc);
1805 delete fop;
1806 }
1807 }
1808
1809 /**********************************************************/
ConstructValNode(ValNodePtr head,Uint1 choice,void * data)1810 ValNodePtr ConstructValNode(ValNodePtr head, Uint1 choice, void* data)
1811 {
1812 ValNodePtr res;
1813
1814 res = ValNodeNew(head);
1815 res->choice = choice;
1816 res->data.ptrvalue = data;
1817 res->next = NULL;
1818 return(res);
1819 }
1820
1821 /**********************************************************/
ConstructValNodeInt(ValNodePtr head,Uint1 choice,Int4 data)1822 ValNodePtr ConstructValNodeInt(ValNodePtr head, Uint1 choice, Int4 data)
1823 {
1824 ValNodePtr res;
1825
1826 res = ValNodeNew(head);
1827 res->choice = choice;
1828 res->data.intvalue = data;
1829 res->next = NULL;
1830 return(res);
1831 }
1832
1833 /**********************************************************/
fta_check_mga_keywords(objects::CMolInfo & mol_info,const TKeywordList & kwds)1834 bool fta_check_mga_keywords(objects::CMolInfo& mol_info, const TKeywordList& kwds)
1835 {
1836 bool is_cage;
1837 bool is_sage;
1838
1839 TKeywordList::const_iterator key_it = kwds.end();
1840
1841 bool got = false;
1842 if (!kwds.empty() && NStr::EqualNocase(kwds.front(), "MGA"))
1843 {
1844 ITERATE(TKeywordList, key, kwds)
1845 {
1846 if(MatchArrayIString(ParFlat_MGA_more_kw_array,
1847 key->c_str()) < 0)
1848 continue;
1849 got = true;
1850 key_it = key;
1851 break;
1852 }
1853 }
1854
1855 if(!got)
1856 {
1857 ErrPostEx(SEV_REJECT, ERR_KEYWORD_MissingMGAKeywords,
1858 "This is apparently a CAGE record, but it lacks the required keywords. Entry dropped.");
1859 return false;
1860 }
1861
1862 if (!mol_info.IsSetTechexp() || !kwds.empty() ||
1863 mol_info.GetTechexp() != "cage")
1864 return true;
1865
1866 for (is_sage = false, is_cage = false; key_it != kwds.end(); ++key_it)
1867 {
1868 const char* p = key_it->c_str();
1869
1870 if (NStr::EqualNocase(p, "5'-SAGE"))
1871 is_sage = true;
1872 else if (NStr::EqualNocase(p, "CAGE (Cap Analysis Gene Expression)"))
1873 is_cage = true;
1874 }
1875
1876 if(is_sage)
1877 {
1878 if(is_cage)
1879 {
1880 ErrPostEx(SEV_REJECT, ERR_KEYWORD_ConflictingMGAKeywords,
1881 "This MGA record contains more than one of the special keywords indicating different techniques.");
1882 return false;
1883 }
1884 mol_info.SetTechexp("5'-sage");
1885 }
1886
1887 return true;
1888 }
1889
1890 /**********************************************************/
// Copies the NUL-terminated string at src to dst, terminator included.
// Bytes are moved strictly front to back, one at a time, so the result
// is well defined even when dst lies before src inside the same buffer.
void fta_StringCpy(char* dst, char* src)
{
    while ((*dst++ = *src++) != '\0')
    {
        // all work happens in the loop condition
    }
}
1900
1901 /**********************************************************/
SetTextId(Uint1 seqtype,objects::CSeq_id & seqId,objects::CTextseq_id & textId)1902 bool SetTextId(Uint1 seqtype, objects::CSeq_id& seqId, objects::CTextseq_id& textId)
1903 {
1904 bool wasSet = true;
1905
1906 switch (seqtype)
1907 {
1908 case objects::CSeq_id::e_Genbank:
1909 seqId.SetGenbank(textId);
1910 break;
1911 case objects::CSeq_id::e_Embl:
1912 seqId.SetEmbl(textId);
1913 break;
1914 case objects::CSeq_id::e_Pir:
1915 seqId.SetPir(textId);
1916 break;
1917 case objects::CSeq_id::e_Swissprot:
1918 seqId.SetSwissprot(textId);
1919 break;
1920 case objects::CSeq_id::e_Other:
1921 seqId.SetOther(textId);
1922 break;
1923 case objects::CSeq_id::e_Ddbj:
1924 seqId.SetDdbj(textId);
1925 break;
1926 case objects::CSeq_id::e_Prf:
1927 seqId.SetPrf(textId);
1928 break;
1929 case objects::CSeq_id::e_Pdb:
1930 {
1931 // TODO: test this branch
1932 objects::CPDB_seq_id pdbId;
1933 pdbId.SetChain_id(0);
1934 seqId.SetPdb(pdbId);
1935 }
1936 break;
1937 case objects::CSeq_id::e_Tpg:
1938 seqId.SetTpg(textId);
1939 break;
1940 case objects::CSeq_id::e_Tpe:
1941 seqId.SetTpe(textId);
1942 break;
1943 case objects::CSeq_id::e_Tpd:
1944 seqId.SetTpd(textId);
1945 break;
1946 case objects::CSeq_id::e_Gpipe:
1947 seqId.SetGpipe(textId);
1948 break;
1949 case objects::CSeq_id::e_Named_annot_track:
1950 seqId.SetNamed_annot_track(textId);
1951 break;
1952
1953 default:
1954 wasSet = false;
1955 }
1956
1957 return wasSet;
1958 }
1959
1960 /**********************************************************/
IsCancelled(const TKeywordList & keywords)1961 bool IsCancelled(const TKeywordList& keywords)
1962 {
1963 ITERATE(TKeywordList, key, keywords)
1964 {
1965 if (NStr::EqualNocase(*key, "HTGS_CANCELLED"))
1966 return true;
1967 }
1968
1969 return false;
1970 }
1971
1972 /**********************************************************/
HasHtg(const TKeywordList & keywords)1973 bool HasHtg(const TKeywordList& keywords)
1974 {
1975 ITERATE(TKeywordList, key, keywords)
1976 {
1977 if (*key == "HTG" || *key == "HTGS_PHASE0" ||
1978 *key == "HTGS_PHASE1" || *key == "HTGS_PHASE2" ||
1979 *key == "HTGS_PHASE3")
1980 {
1981 return true;
1982 }
1983 }
1984
1985 return false;
1986 }
1987
1988 /**********************************************************/
RemoveHtgPhase(TKeywordList & keywords)1989 void RemoveHtgPhase(TKeywordList& keywords)
1990 {
1991 for (TKeywordList::iterator key = keywords.begin(); key != keywords.end();)
1992 {
1993 const char* p = key->c_str();
1994 if (NStr::EqualNocase(p, 0, 10, "HTGS_PHASE") &&
1995 (p[10] == '0' || p[10] == '1' || p[10] == '2' ||
1996 p[10] == '3') && p[11] == '\0')
1997 {
1998 key = keywords.erase(key);
1999 }
2000 else
2001 ++key;
2002 }
2003 }
2004
2005 /**********************************************************/
HasHtc(const TKeywordList & keywords)2006 bool HasHtc(const TKeywordList& keywords)
2007 {
2008 ITERATE(TKeywordList, key, keywords)
2009 {
2010 if (NStr::EqualNocase(*key, "HTC"))
2011 {
2012 return true;
2013 }
2014 }
2015
2016 return false;
2017 }
2018
2019 END_NCBI_SCOPE
2020