1 /* indx_blk.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  indx_blk.c
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Common for all format functions.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include "index.h"
41 #include <objtools/flatfile/flatfile_parse_info.hpp>
42 
43 #include "ftaerr.hpp"
44 #include "indx_blk.h"
45 #include "indx_def.h"
46 #include "utilfun.h"
47 #include <map>
48 
49 #ifdef THIS_FILE
50 #    undef THIS_FILE
51 #endif
52 #define THIS_FILE "indx_blk.cpp"
53 
54 
55 BEGIN_NCBI_SCOPE
56 USING_SCOPE(objects);
/* Strand names matched (ignoring case) by XMLCheckSTRAND();
   NULL-terminated. */
static const char* XML_STRAND_array[] = {
    "   ",
    "single",
    "double",
    "mixed",
    NULL
};

/* Topology names matched (ignoring case) by XMLCheckTPG();
   NULL-terminated. */
static const char* XML_TPG_array[] = {
    "   ",
    "Linear",
    "Circular",
    "Tandem",
    NULL
};

/* Three-character strand prefixes looked up by CheckSTRAND();
   NULL-terminated. */
static const char* ParFlat_STRAND_array[] = {
    "   ",
    "ss-",
    "ds-",
    "ms-",
    NULL
};

/* Space-padded, nine-character topology strings looked up by
   CheckTPG(); NULL-terminated. */
static const char* ParFlat_TPG_array[] = {
    "         ",
    "Linear   ",
    "Circular ",
    "Tandem   ",
    NULL
};
72 
/* Extra nucleic-acid molecule name consulted only when the source
   is DDBJ (see CkLocusLinePos); NULL-terminated. */
static const char* ParFlat_NA_array_DDBJ[] = {
    "cDNA",
    NULL
};

/* Molecule name tried by CkLocusLinePos() when the nucleic-acid
   table does not match; NULL-terminated. */
static const char* ParFlat_AA_array_DDBJ[] = {
    "PRT",
    NULL
};

/* Molecule type strings looked up by CheckNA() and
   CkLocusLinePos(); NULL-terminated. */
static const char* ParFlat_NA_array[] = {
    "    ", "NA", "DNA", "genomic DNA", "other DNA", "unassigned DNA",
    "RNA", "mRNA", "rRNA", "tRNA", "uRNA", "scRNA", "snRNA", "snoRNA",
    "pre-RNA", "pre-mRNA", "genomic RNA", "other RNA", "unassigned RNA",
    "cRNA", "viral cRNA",
    NULL
};

/* Three-letter division codes looked up by CheckDIV();
   NULL-terminated. */
static const char* ParFlat_DIV_array[] = {
    "   ", "PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT",
    "RNA", "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS",
    "ORG", "GSS", "HUM", "HTG", "CON", "HTC", "ENV", "TSA",
    NULL
};
93 
/* Accession prefix tables, one per data source as indicated by the
   array name.  Every table is terminated by a NULL entry. */

static const char* embl_accpref[] = {
    "AJ", "AL", "AM", "AN", "AX", "BN", "BX", "CQ", "CR", "CS", "CT", "CU",
    "FB", "FM", "FN", "FO", "FP", "FQ", "FR", "GM", "GN", "HA", "HB", "HC",
    "HD", "HE", "HF", "HG", "HH", "HI", "JA", "JB", "JC", "JD", "JE", "LK",
    "LL", "LM", "LN", "LO", "LP", "LQ", "LR", "LS", "LT", "MP", "MQ", "MR",
    "MS", "OA", "OB", "OC", "OD", "OE",
    NULL
};

static const char* lanl_accpref[] = { "AD", NULL };

static const char* pir_accpref[] = { "CC", NULL };

static const char* prf_accpref[] = { "XX", NULL };

static const char* sprot_accpref[] = { "DD", NULL };

static const char* ddbj_accpref[] = {
    "AB", "AG", "AK", "AP", "AT", "AU", "AV", "BA", "BB", "BD", "BJ", "BP",
    "BR", "BS", "BW", "BY", "CI", "CJ", "DA", "DB", "DC", "DD", "DE", "DF",
    "DG", "DH", "DI", "DJ", "DK", "DL", "DM", "FS", "FT", "FU", "FV", "FW",
    "FX", "FY", "FZ", "GA", "GB", "HT", "HU", "HV", "HW", "HX", "HY", "HZ",
    "LA", "LB", "LC", "LD", "LE", "LF", "LG", "LH", "LI", "LJ", "LU", "LV",
    "LX", "LY", "LZ", "MA", "MB", "MC", "MD", "ME", "OF", "OG",
    NULL
};
126 
/* Two-letter accession prefixes listed for NCBI (per the array
   name); terminated by a NULL entry. */
static const char* ncbi_accpref[] = {
    "AA", "AC", "AD", "AE", "AF", "AH", "AI", "AQ", "AR", "AS", "AW", "AY",
    "AZ", "BC", "BE", "BF", "BG", "BH", "BI", "BK", "BL", "BM", "BQ", "BT",
    "BU", "BV", "BZ", "CA", "CB", "CC", "CD", "CE", "CF", "CG", "CH", "CK",
    "CL", "CM", "CN", "CO", "CP", "CV", "CW", "CX", "CY", "CZ", "DN", "DP",
    "DQ", "DR", "DS", "DT", "DU", "DV", "DW", "DX", "DY", "DZ", "EA", "EB",
    "EC", "ED", "EE", "EF", "EG", "EH", "EI", "EJ", "EK", "EL", "EM", "EN",
    "EP", "EQ", "ER", "ES", "ET", "EU", "EV", "EW", "EX", "EY", "EZ", "FA",
    "FC", "FD", "FE", "FF", "FG", "FH", "FI", "FJ", "FK", "FL", "GC", "GD",
    "GE", "GF", "GG", "GH", "GJ", "GK", "GL", "GO", "GP", "GQ", "GR", "GS",
    "GT", "GU", "GV", "GW", "GX", "GY", "GZ", "HJ", "HK", "HL", "HM", "HN",
    "HO", "HP", "HQ", "HR", "HS", "JF", "JG", "JH", "JI", "JJ", "JK", "JL",
    "JM", "JN", "JO", "JP", "JQ", "JR", "JS", "JT", "JU", "JV", "JW", "JX",
    "JY", "JZ", "KA", "KB", "KC", "KD", "KE", "KF", "KG", "KH", "KI", "KJ",
    "KK", "KL", "KM", "KN", "KO", "KP", "KQ", "KR", "KS", "KT", "KU", "KV",
    "KX", "KY", "KZ", "MF", "MG", "MH", "MI", "MJ", "MK", "ML", "MM", "MN",
    "MO", "MT", "MU",
    NULL
};
145 
/* Further accession prefix tables; the category is the one the
   array name states.  Each is terminated by a NULL entry. */

static const char* refseq_accpref[] = {
    "NC_", "NG_", "NM_", "NP_", "NR_", "NT_", "NW_", "XM_", "XP_", "XR_",
    "NZ_",
    NULL
};

static const char* refseq_prot_accpref[] = {
    "AP_", "NP_", "WP_", "XP_", "YP_", "ZP_",
    NULL
};

static const char* acc_tsa_allowed[] = {
    "AF", "AY", "DQ", "EF", "EU", "FJ", "GQ", "HQ", "JF", "JN", "JQ", "JX",
    "KC", "KF", "KJ", "KM", "KP", "KR", "KT", "KU", "KX", "KY", "MF", "MG",
    "MH", "MK", "MN", "MT",
    NULL
};

static const char* ncbi_tpa_accpref[] = { "BK", "BL", "GJ", "GK", NULL };

static const char* ddbj_tpa_accpref[] = { "BR", "HT", "HU", NULL };

static const char* ncbi_wgs_accpref[] = { "GJ", "GK", NULL };

static const char* ddbj_wgs_accpref[] = { "HT", "HU", NULL };
176 
/* Set of two-letter prefixes; by its name, those of WGS scaffold
   accessions. */
static const set<string> k_WgsScaffoldPrefix = {
    "CH", "CT", "CU", "DF", "DG", "DS", "EM", "EN",
    "EP", "EQ", "FA", "FM", "GG", "GJ", "GK", "GL",
    "HT", "HU", "JH", "KB", "KD", "KE", "KI", "KK",
    "KL", "KN", "KQ", "KV", "KZ", "LD", "ML", "MU"
};
184 
185 //static const char *wgs_scfld_pref[] =
186 
/* Display names for data sources, addressed by position.  The
   first and the last slot both read "unknown". */
static const char* source[11] = {
    "unknown",    /*  0 */
    "EMBL",       /*  1 */
    "GENBANK",    /*  2 */
    "PIR",        /*  3 */
    "Swiss-Prot", /*  4 */
    "NCBI",       /*  5 */
    "GSDB",       /*  6 */
    "DDBJ",       /*  7 */
    "FlyBase",    /*  8 */
    "RefSeq",     /*  9 */
    "unknown"     /* 10 */
};
200 
201 
202 static const map<Parser::ESource, string> sourceNames =  {
203     {Parser::ESource::unknown, "unknown"},
204     {Parser::ESource::EMBL, "EMBL"},
205     {Parser::ESource::GenBank, "GENBANK"},
206     {Parser::ESource::PIR, "PIR"},
207     {Parser::ESource::SPROT, "Swiss-Prot"},
208     {Parser::ESource::NCBI, "NCBI"},
209     {Parser::ESource::LANL, "GSDB"},
210     {Parser::ESource::Flybase, "FlyBase"},
211     {Parser::ESource::Refseq, "RefSeq"},
212     {Parser::ESource::PRF, "unknown"}};
213 
/* Month abbreviations searched for by CkDateFormat().  Slot 0
   holds "Ill", so JAN..DEC sit at indices 1..12.  NULL-terminated. */
static const char* month_name[] = {
    "Ill",
    "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC",
    NULL
};

/* Length-unit tokens recognised (ignoring case) by GetResidue();
   NULL-terminated. */
static const char* ParFlat_RESIDUE_STR[] = {
    "bp", "bp.", "bp,",
    "AA", "AA.", "AA,",
    NULL
};

/* Molecule types accepted by fta_check_embl_moltype() in the new
   style EMBL ID line; compared case-sensitively.  NULL-terminated. */
static const char* ValidMolTypes[] = {
    "genomic DNA", "genomic RNA",
    "mRNA", "tRNA", "rRNA",
    "snoRNA", "snRNA", "scRNA",
    "pre-RNA", "pre-mRNA",
    "other RNA", "other DNA",
    "transcribed RNA",
    "unassigned RNA", "unassigned DNA",
    "viral cRNA",
    NULL
};
242 
243 // functions below are implemented in different source files
244 bool EmblIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len));
245 bool GenBankIndex(ParserPtr pp);
246 bool SprotIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char* offset, Int4 len));
247 bool PrfIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char* offset, Int4 len));
248 bool PirIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char* offset, Int4 len));
249 bool XMLIndex(ParserPtr pp);
250 
251 /**********************************************************
252  *
253  *   static char* GetResidue(stoken):
254  *
255  *      Return a string pointer in the "stoken" which its
256  *   next token string match any one string in the
257  *   ParFlat_RESIDUE_STR but ignore case for all alphabetic
258  *   characters; return NULL if not found.
259  *
260  *                                              3-25-93
261  *
262  **********************************************************/
GetResidue(TokenStatBlkPtr stoken)263 static char* GetResidue(TokenStatBlkPtr stoken)
264 {
265     TokenBlkPtr  sptr;
266     TokenBlkPtr  ptr;
267     const char   **b;
268     Int2         i;
269 
270     ptr = stoken->list;
271     sptr = stoken->list->next;
272     for(i = 1; i < stoken->num; i++, ptr = ptr->next, sptr = sptr->next)
273     {
274         for(b = ParFlat_RESIDUE_STR; *b != NULL; b++)
275             if(StringICmp(*b, sptr->str) == 0)
276                 return(ptr->str);
277     }
278 
279     return(NULL);
280 }
281 
282 /**********************************************************
283  *
284  *   bool XReadFile(fp, finfo):
285  *
286  *      Record position and line # of the file, loop stop
287  *   when got a none blank line.
288  *      Return TRUE if END_OF_FILE.
289  *
290  *                                              2-26-93
291  *
292  **********************************************************/
XReadFile(FILE * fp,FinfoBlkPtr finfo)293 bool XReadFile(FILE* fp, FinfoBlkPtr finfo)
294 {
295     bool end_of_file = false;
296 
297     StringCpy(finfo->str, "\n");
298     while(!end_of_file && StringNCmp(finfo->str, "\n", 1) == 0)
299     {
300         finfo->pos = (size_t) ftell(fp);
301         if (fgets(finfo->str, sizeof(finfo->str) - 1, fp) == NULL)
302             end_of_file = true;
303         else
304             ++(finfo->line);
305     }
306 
307     auto n = strlen(finfo->str);
308     while (n) {
309         n--;
310         if (finfo->str[n] != '\n' && finfo->str[n] != '\r') {
311             break;
312         }
313         finfo->str[n] = 0;
314     }
315 
316     return(end_of_file);
317 }
318 
319 /**********************************************************/
FileGetsBuf(char * res,Int4 size,FileBuf & fbuf)320 static Int2 FileGetsBuf(char* res, Int4 size, FileBuf& fbuf)
321 {
322     const char* p = nullptr;
323     char* q;
324     Int4    l;
325     Int4    i;
326 
327     if(*fbuf.current == '\0')
328         return(0);
329 
330     l = size - 1;
331     for(p = fbuf.current, q = res, i = 0; i < l; i++, p++)
332     {
333         *q++ = *p;
334         if(*p == '\n' || *p == '\r')
335         {
336             p++;
337             break;
338         }
339     }
340 
341     *q = '\0';
342     fbuf.current = p;
343     return(1);
344 }
345 
346 /**********************************************************/
XReadFileBuf(FileBuf & fbuf,FinfoBlkPtr finfo)347 bool XReadFileBuf(FileBuf& fbuf, FinfoBlkPtr finfo)
348 {
349     bool end_of_file = false;
350 
351     StringCpy(finfo->str, "\n");
352     while(!end_of_file && StringNCmp(finfo->str, "\n", 1) == 0)
353     {
354         finfo->pos = (size_t) (fbuf.current - fbuf.start);
355         if(FileGetsBuf(finfo->str, sizeof(finfo->str) - 1, fbuf) == 0)
356             end_of_file = true;
357         else
358             ++(finfo->line);
359     }
360 
361     return(end_of_file);
362 }
363 
364 /**********************************************************
365  *
366  *   bool SkipTitle(fp, finfo, str, len):
367  *
368  *      Return TRUE if file contains no entry in which no
369  *   match in keyword "str".
370  *      Skip any title declaration lines.
371  *
372  *                                              3-5-93
373  *
374  **********************************************************/
SkipTitle(FILE * fp,FinfoBlkPtr finfo,const char * str,Int2 len)375 bool SkipTitle(FILE* fp, FinfoBlkPtr finfo, const char *str, Int2 len)
376 {
377     bool end_of_file = XReadFile(fp, finfo);
378     while(!end_of_file && StringNCmp(finfo->str, str, len) != 0)
379         end_of_file = XReadFile(fp, finfo);
380 
381     return(end_of_file);
382 }
383 
384 
SkipTitle(FILE * fp,FinfoBlkPtr finfo,const CTempString & keyword)385 bool SkipTitle(FILE* fp, FinfoBlkPtr finfo, const CTempString& keyword)
386 {
387     return SkipTitle(fp, finfo, keyword.data(), keyword.size());
388 }
389 
390 /**********************************************************/
SkipTitleBuf(FileBuf & fbuf,FinfoBlkPtr finfo,const char * str,Int2 len)391 bool SkipTitleBuf(FileBuf& fbuf, FinfoBlkPtr finfo, const char *str, Int2 len)
392 {
393     bool end_of_file = XReadFileBuf(fbuf, finfo);
394     while(!end_of_file && StringNCmp(finfo->str, str, len) != 0)
395         end_of_file = XReadFileBuf(fbuf, finfo);
396 
397     return(end_of_file);
398 }
399 
400 
SkipTitleBuf(FileBuf & fbuf,FinfoBlkPtr finfo,const CTempString & keyword)401 bool SkipTitleBuf(FileBuf& fbuf, FinfoBlkPtr finfo, const CTempString& keyword)
402 {
403     return SkipTitleBuf(fbuf, finfo, keyword.data(), keyword.size());
404 }
405 
406 
407 /**********************************************************
408  *
409  *   static bool CheckLocus(locus):
410  *
411  *      Locus name only allow A-Z, 0-9, characters,
412  *   reject if not.
413  *
414  **********************************************************/
CheckLocus(char * locus,Parser::ESource source)415 static bool CheckLocus(char* locus, Parser::ESource source)
416 {
417     char* p = locus;
418     if(StringNCmp(locus, "SEG_", 4) == 0 &&
419        (source == Parser::ESource::NCBI || source == Parser::ESource::DDBJ))
420         p += 4;
421     for(; *p != '\0'; p++)
422     {
423         if((*p >= '0' && *p <= '9') || (*p >= 'A' && *p <= 'Z') ||
424            (*p == '.' && source == Parser::ESource::Flybase))
425             continue;
426         if(((*p >= 'a' && *p <= 'z') || *p == '_' || *p == '-' || *p == '(' ||
427              *p == ')' || *p == '/') && source == Parser::ESource::Refseq)
428             continue;
429 
430         ErrPostEx(SEV_ERROR, ERR_LOCUS_BadLocusName,
431                   "Bad locusname, <%s> for this entry", locus);
432         break;
433     }
434 
435     return (*p != '\0');
436 }
437 
438 /**********************************************************
439  *
440  *   static bool CheckLocusSP(locus):
441  *
442  *      Locus name consists of up tp 10 uppercase
443  *   alphanumeric characters.
444  *      Rule: X_Y format (SWISS-PROT), reject if not
445  *      - X is a mnemonic code, up to 4 alphanumeric
446  *        characters to represent the protein name.
447  *      - Y is a mnemonic species identification code of
448  *        at most 5 alphanumeric characters to representing
449  *        the biological source of the protein.
450  *      Checking the defined species identification code
451  *   has not been implemented.
452  *
453  *      Example:  RL1_ECOLI   FER_HALHA
454  *
455  **********************************************************/
CheckLocusSP(char * locus)456 static bool CheckLocusSP(char* locus)
457 {
458     char* p;
459     bool underscore = false;
460     Int2    x;
461     Int2    y;
462 
463     for(p = locus, x = y = 0; *p != '\0'; p++)
464     {
465         if((*p >= '0' && *p <= '9') || (*p >= 'A' && *p <= 'Z'))
466         {
467             if (!underscore)
468                 x++;
469             else
470                 y++;
471         }
472         else if(*p == '_')
473             underscore = true;
474         else
475             break;
476     }
477 
478     if(*p != '\0' || x == 0 || y == 0)
479     {
480         ErrPostEx(SEV_ERROR, ERR_LOCUS_BadLocusName,
481                   "Bad locusname, <%s> for this entry", locus);
482         return true;
483     }
484 
485     return false;
486 }
487 
488 /**********************************************************
489  *
490  *   static bool CkDateFormat(date):
491  *
492  *      Return FALSE if date != dd-mmm-yyyy format.
493  *
494  **********************************************************/
CkDateFormat(char * date)495 static bool CkDateFormat(char* date)
496 {
497     if(date[2] == '-' && date[6] == '-' &&
498        IS_DIGIT(date[0]) != 0 && IS_DIGIT(date[1]) != 0 &&
499        IS_DIGIT(date[7]) != 0 && IS_DIGIT(date[8]) != 0 &&
500        IS_DIGIT(date[9]) != 0 && IS_DIGIT(date[10]) != 0 &&
501        MatchArraySubString(month_name, date) != -1)
502         return true;
503 
504     return false;
505 }
506 
507 /**********************************************************/
CheckSTRAND(const char * str)508 Int2 CheckSTRAND(const char* str)
509 {
510     return(fta_StringMatch(ParFlat_STRAND_array, str));
511 }
512 
513 /**********************************************************/
XMLCheckSTRAND(char * str)514 Int2 XMLCheckSTRAND(char* str)
515 {
516     return(StringMatchIcase(XML_STRAND_array, str));
517 }
518 
519 /**********************************************************/
XMLCheckTPG(char * str)520 Int2 XMLCheckTPG(char* str)
521 {
522     Int2 i;
523 
524     i = StringMatchIcase(XML_TPG_array, str);
525     if(i == 0)
526         i++;
527     return(i);
528 }
529 
530 /**********************************************************/
CheckTPG(char * str)531 Int2 CheckTPG(char* str)
532 {
533     return(StringMatchIcase(ParFlat_TPG_array, str));
534 }
535 
536 /**********************************************************/
CheckNADDBJ(char * str)537 Int2 CheckNADDBJ(char* str)
538 {
539     return(fta_StringMatch(ParFlat_NA_array_DDBJ, str));
540 }
541 
542 /**********************************************************/
CheckNA(char * str)543 Int2 CheckNA(char* str)
544 {
545     return(fta_StringMatch(ParFlat_NA_array, str));
546 }
547 
548 /**********************************************************/
CheckDIV(char * str)549 Int2 CheckDIV(char* str)
550 {
551     return(fta_StringMatch(ParFlat_DIV_array, str));
552 }
553 
554 /**********************************************************/
CkLocusLinePos(char * offset,Parser::ESource source,LocusContPtr lcp,bool is_mga)555 bool CkLocusLinePos(char* offset, Parser::ESource source, LocusContPtr lcp, bool is_mga)
556 {
557     Char    date[12];
558     bool ret = true;
559     char* p;
560     Int4    i;
561 
562     p = StringChr(offset, '\n');
563     if(p != NULL)
564         *p = '\0';
565 
566     if(is_mga == false && StringNCmp(offset + lcp->bp, "bp", 2) != 0 &&
567        StringNCmp(offset + lcp->bp, "rc", 2) != 0 &&
568        StringNCmp(offset + lcp->bp, "aa", 2) != 0)
569     {
570         i = lcp->bp + 1;
571         ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
572                   "bp/rc string unrecognized in column %d-%d: %s",
573                   i, i + 1, offset + lcp->bp);
574         ret = false;
575     }
576     if(CheckSTRAND(offset + lcp->strand) == -1)
577     {
578         i = lcp->strand + 1;
579         ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
580                   "Strand unrecognized in column %d-%d : %s",
581                   i, i + 2, offset + lcp->strand);
582     }
583 
584     p = offset + lcp->molecule;
585     if(is_mga)
586     {
587         if(StringNICmp(p, "mRNA", 4) != 0 && StringNCmp(p, "RNA", 3) != 0)
588         {
589             ErrPostEx(SEV_REJECT, ERR_FORMAT_IllegalCAGEMoltype,
590                       "Illegal molecule type provided in CAGE record in LOCUS line: \"%s\". Must be \"mRNA\"or \"RNA\". Entry dropped.",
591                       p);
592             ret = false;
593         }
594     }
595     else if(StringMatchIcase(ParFlat_NA_array, p) == -1)
596     {
597         if(StringMatchIcase(ParFlat_AA_array_DDBJ, p) == -1)
598         {
599             i = lcp->molecule + 1;
600             if(source != Parser::ESource::DDBJ ||
601                StringMatchIcase(ParFlat_NA_array_DDBJ, p) == -1)
602             {
603                 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
604                           "Molecule unrecognized in column %d-%d: %s",
605                           i, i + 5, p);
606                 ret = false;
607             }
608         }
609     }
610 
611     if(CheckTPG(offset + lcp->topology) == -1)
612     {
613         i = lcp->topology + 1;
614         ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
615                   "Topology unrecognized in column %d-%d: %s",
616                   i, i + 7, offset + lcp->topology);
617         ret = false;
618     }
619     if(CheckDIV(offset + lcp->div) == -1)
620     {
621         i = lcp->div + 1;
622         ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
623                   "Division code unrecognized in column %d-%d: %s",
624                   i, i + 2, offset + lcp->div);
625         ret = (source == Parser::ESource::LANL);
626     }
627     MemCpy(date, offset + lcp->date, 11);
628     date[11] = '\0';
629     if(StringNCmp(date, "NODATE", 6) == 0)
630     {
631         ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
632                   "NODATE in LOCUS line will be replaced by current system date");
633     }
634     else if(!CkDateFormat(date))
635     {
636         i = lcp->date + 1;
637         ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
638                   "Date should be in column %d-%d, and format dd-mmm-yyyy: %s",
639                   i, i + 10, date);
640         ret = false;
641     }
642 
643     if(p != NULL)
644         *p = '\n';
645     return(ret);
646 }
647 
648 /**********************************************************
649     *
650     *   CRef<objects::CDate_std> GetUpdateDate(ptr, source):
651     *
652     *      Return NULL if ptr does not have dd-mmm-yyyy format
653     *   or "NODATE"; otherwise, return Date-std pointer.
654     *
655     **********************************************************/
GetUpdateDate(char * ptr,Parser::ESource source)656 CRef<objects::CDate_std> GetUpdateDate(char* ptr, Parser::ESource source)
657 {
658     Char date[12];
659 
660     if (StringNCmp(ptr, "NODATE", 6) == 0)
661         return CRef<objects::CDate_std>(new objects::CDate_std(CTime(CTime::eCurrent)));
662 
663     if (ptr[11] != '\0' && ptr[11] != '\n' && ptr[11] != ' ' &&
664         (source != Parser::ESource::SPROT || ptr[11] != ','))
665         return CRef<objects::CDate_std>();
666 
667     MemCpy(date, ptr, 11);
668     date[11] = '\0';
669 
670     if (!CkDateFormat(date))
671         return CRef<objects::CDate_std>();
672 
673     return get_full_date(ptr, false, source);
674 }
675 
676 
677 /**********************************************************/
fta_check_embl_moltype(char * str)678 static bool fta_check_embl_moltype(char* str)
679 {
680     const char **b;
681     char*    p;
682     char*    q;
683 
684     p = StringChr(str, ';');
685     p = StringChr(p + 1, ';');
686     p = StringChr(p + 1, ';');
687 
688     for(p++; *p == ' ';)
689        p++;
690 
691     q = StringChr(p, ';');
692     *q = '\0';
693 
694     for(b = ValidMolTypes; *b != NULL; b++)
695         if(StringCmp(p, *b) == 0)
696             break;
697 
698     if(*b != NULL)
699     {
700         *q = ';';
701         return true;
702     }
703 
704     ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidIDlineMolType,
705               "Invalid moltype value \"%s\" provided in ID line of EMBL record.",
706               p);
707     *q = ';';
708     return false;
709 }
710 
711 /*********************************************************
712 indexblk_struct constructor
713 **********************************************************/
indexblk_struct()714 indexblk_struct::indexblk_struct() :
715     vernum(0),
716     offset(0),
717     bases(0),
718     segnum(0),
719     segtotal(0),
720     linenum(0),
721     drop(0),
722     len(0),
723     EST(false),
724     STS(false),
725     GSS(false),
726     HTC(false),
727     htg(0),
728     is_contig(false),
729     is_mga(false),
730     origin(false),
731     is_pat(false),
732     is_wgs(false),
733     is_tpa(false),
734     is_tsa(false),
735     is_tls(false),
736     is_tpa_wgs_con(false),
737     tsa_allowed(false),
738     moltype(NULL),
739     gaps(NULL),
740     secaccs(NULL),
741     xip(NULL),
742     embl_new_ID(false),
743     env_sample_qual(false),
744     is_prot(false),
745     organism(NULL),
746     taxid(0),
747     no_gc_warning(false),
748     qsoffset(0),
749     qslength(0),
750     wgs_and_gi(0),
751     got_plastid(false),
752     gc_genomic(0),
753     gc_mito(0),
754     specialist_db(false),
755     inferential(false),
756     experimental(false),
757     submitter_seqid(NULL),
758     ppp(NULL)
759 {
760     acnum[0] = 0;
761     locusname[0] = 0;
762     division[0] = 0;
763     blocusname[0] = 0;
764 
765     MemSet(&lc, 0, sizeof(lc));
766 
767     wgssec[0] = 0;
768 }
769 
/* Predicate for the token scanners below: true if "c" is a
 * whitespace character according to isspace().
 *
 * The argument is converted to unsigned char first.  isspace() is
 * only defined for values representable as unsigned char (or EOF),
 * so passing a plain char with the high bit set, where char is
 * signed, was undefined behavior.
 */
static bool isSpace(char c)
{
    return isspace((unsigned char) c) != 0;
}
774 
775 
776 static CTempString::const_iterator
sFindNextSpace(const CTempString & tempString,CTempString::const_iterator current_it)777 sFindNextSpace(const CTempString& tempString,
778         CTempString::const_iterator current_it)
779 {
780     return find_if(current_it, tempString.end(), isSpace);
781 }
782 
783 
784 static CTempString::const_iterator
sFindNextNonSpace(const CTempString & tempString,CTempString::const_iterator current_it)785 sFindNextNonSpace(const CTempString& tempString,
786         CTempString::const_iterator current_it)
787 {
788     return find_if_not(current_it, tempString.end(), isSpace);
789 }
790 
791 
sSetLocusLineOffsets(const CTempString & locusLine,LocusCont & offsets)792 static void sSetLocusLineOffsets(const CTempString& locusLine, LocusCont& offsets)
793 {
794     offsets.bases = -1;
795     offsets.bp = -1;
796     offsets.strand = -1;
797     offsets.molecule = -1;
798     offsets.topology = -1;
799     offsets.div = -1;
800     offsets.date = -1;
801 
802     if (locusLine.substr(0,5) != "LOCUS") {
803         // throw an exception - invalid locus line
804     }
805 
806 
807     auto it = sFindNextNonSpace(locusLine, locusLine.begin()+5);
808     if (it == locusLine.end()) {
809         // throw an exception - no locus name
810     }
811 
812     it = sFindNextSpace(locusLine, it);
813     if (it == locusLine.end()) {
814         return;
815     }
816 
817     // find the number of bases
818     it = sFindNextNonSpace(locusLine, it);
819     if (it == locusLine.end()) {
820         return;
821     }
822     auto space_it = sFindNextSpace(locusLine, it);
823     if (NStr::StringToNonNegativeInt(locusLine.substr(it-begin(locusLine), space_it-it)) == -1) {
824         return;
825     }
826 
827     offsets.bases = it - begin(locusLine);
828 
829     it = sFindNextNonSpace(locusLine, space_it);
830     offsets.bp = it - begin(locusLine);
831 
832     it = sFindNextSpace(locusLine, it);
833     it = sFindNextNonSpace(locusLine, it);
834     // the next one might be a strand
835     // or might be a molecule
836     space_it = sFindNextSpace(locusLine, it);
837     offsets.strand = -1;
838     if ((space_it - it)==3) {
839         auto currentSubstr = locusLine.substr(it-begin(locusLine),3);
840         if (currentSubstr=="ss-" ||
841             currentSubstr=="ds-" ||
842             currentSubstr=="ms-") {
843             offsets.strand = it - begin(locusLine);
844             it = sFindNextNonSpace(locusLine, space_it);
845         }
846         offsets.molecule = it - begin(locusLine);
847     }
848     else {
849         offsets.molecule = it - begin(locusLine);
850     }
851 
852     // topology
853     it = sFindNextSpace(locusLine, it);
854     it = sFindNextNonSpace(locusLine, it);
855     if (it != locusLine.end()) {
856         offsets.topology = it - begin(locusLine);
857     }
858 
859     // find division
860     it = sFindNextSpace(locusLine, it);
861     it = sFindNextNonSpace(locusLine, it);
862     if (it != locusLine.end()) {
863         offsets.div = it - begin(locusLine);
864     }
865 
866     // find date - date is optional
867     it = sFindNextSpace(locusLine, it);
868     it = sFindNextNonSpace(locusLine, it);
869     if (it != locusLine.end()) {
870         offsets.date = it - begin(locusLine);
871     }
872 }
873 
874 /**********************************************************
875  *
876  *   IndexblkPtr InitialEntry(pp, finfo):
877  *
878  *      Assign the entry's value to offset, locusname,
879  *   bases, linenum, drop blocusname.
880  *      Swiss-prot locusname checking is different from
881  *   others.
882  *      Check LOCUS line column position, genbank format.
883  *
884  **********************************************************/
InitialEntry(ParserPtr pp,FinfoBlkPtr finfo)885 IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlkPtr finfo)
886 {
887     Int2            i;
888     Int2            j;
889     TokenStatBlkPtr stoken;
890     TokenBlkPtr     ptr;
891     char*         bases;
892     IndexblkPtr     entry;
893     char*         p;
894 
895     entry = new Indexblk;
896 
897     entry->offset = finfo->pos;
898     entry->linenum = finfo->line;
899     entry->ppp = pp;
900     entry->is_tsa = false;
901     entry->is_tls = false;
902     entry->is_pat = false;
903 
904     if(pp->source == Parser::ESource::PRF)
905         stoken = TokenString(finfo->str, ';');
906     else
907         stoken = TokenString(finfo->str, ' ');
908 
909     bool badlocus = false;
910     if(stoken->num > 2 || (pp->format == Parser::EFormat::PRF && stoken->num > 1))
911     {
912         p = finfo->str;
913         if (pp->mode == Parser::EMode::Relaxed) {
914             sSetLocusLineOffsets(p, entry->lc);
915         } else {
916             if(StringLen(p) > 78 && p[28] == ' ' && p[63] == ' ' && p[67] == ' ')
917             {
918                 entry->lc.bases = ParFlat_COL_BASES_NEW;
919                 entry->lc.bp = ParFlat_COL_BP_NEW;
920                 entry->lc.strand = ParFlat_COL_STRAND_NEW;
921                 entry->lc.molecule = ParFlat_COL_MOLECULE_NEW;
922                 entry->lc.topology = ParFlat_COL_TOPOLOGY_NEW;
923                 entry->lc.div = ParFlat_COL_DIV_NEW;
924                 entry->lc.date = ParFlat_COL_DATE_NEW;
925             }
926             else
927             {
928                 entry->lc.bases = ParFlat_COL_BASES;
929                 entry->lc.bp = ParFlat_COL_BP;
930                 entry->lc.strand = ParFlat_COL_STRAND;
931                 entry->lc.molecule = ParFlat_COL_MOLECULE;
932                 entry->lc.topology = ParFlat_COL_TOPOLOGY;
933                 entry->lc.div = ParFlat_COL_DIV;
934                 entry->lc.date = ParFlat_COL_DATE;
935             }
936         }
937 
938         ptr = stoken->list->next;
939         if(pp->format == Parser::EFormat::EMBL && ptr->next != NULL &&
940            ptr->next->str != NULL && StringCmp(ptr->next->str, "SV") == 0)
941         {
942             for(i = 0, p = finfo->str; *p != '\0'; p++)
943                 if(*p == ';' && p[1] == ' ')
944                     i++;
945 
946             entry->embl_new_ID = true;
947             p = StringRChr(ptr->str, ';');
948             if(p != NULL && p[1] == '\0')
949                 *p = '\0';
950 
951             FtaInstallPrefix(PREFIX_LOCUS, ptr->str, NULL);
952             FtaInstallPrefix(PREFIX_ACCESSION, ptr->str, NULL);
953 
954             if(i != 6 || (stoken->num != 10 && stoken->num != 11))
955             {
956                 ErrPostEx(SEV_REJECT, ERR_FORMAT_BadlyFormattedIDLine,
957                           "The number of fields in this EMBL record's new ID line does not fit requirements.");
958                 badlocus = true;
959             }
960             else if(fta_check_embl_moltype(finfo->str) == false)
961                 badlocus = true;
962         }
963 
964         StringCpy(entry->locusname, ptr->str);
965         StringCpy(entry->blocusname, entry->locusname);
966         if(pp->format == Parser::EFormat::PIR || pp->format == Parser::EFormat::PRF)
967             StringCpy(entry->acnum, entry->locusname);
968 
969         if(entry->embl_new_ID == false)
970         {
971             FtaInstallPrefix(PREFIX_LOCUS, entry->locusname, NULL);
972             FtaInstallPrefix(PREFIX_ACCESSION, entry->locusname, NULL);
973         }
974 
975         if(pp->mode != Parser::EMode::Relaxed && !badlocus)
976         {
977             if(pp->format == Parser::EFormat::SPROT)
978             {
979                 if(ptr->next == NULL || ptr->next->str == NULL ||
980                    (StringNICmp(ptr->next->str, "preliminary", 11) != 0 &&
981                     StringNICmp(ptr->next->str, "unreviewed", 10) != 0))
982                     badlocus = CheckLocusSP(entry->locusname);
983                 else
984                     badlocus = false;
985             }
986             else if(pp->format == Parser::EFormat::PIR || pp->format == Parser::EFormat::PRF)
987                 badlocus = false;
988             else
989                 badlocus = CheckLocus(entry->locusname, pp->source);
990         }
991     }
992     else if (pp->mode != Parser::EMode::Relaxed)
993     {
994         badlocus = true;
995         ErrPostStr(SEV_ERROR, ERR_LOCUS_NoLocusName,
996                    "No locus name for this entry");
997     }
998 
999     if(badlocus)
1000     {
1001         p = StringChr(finfo->str, '\n');
1002         if(p != NULL)
1003             *p = '\0';
1004         ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1005                   "Entry skipped. LOCUS line = \"%s\".", finfo->str);
1006         if(p != NULL)
1007             *p = '\n';
1008         MemFree(entry);
1009         FreeTokenstatblk(stoken);
1010         return(NULL);
1011     }
1012 
1013     if(pp->format == Parser::EFormat::PIR || pp->format == Parser::EFormat::PRF)
1014     {
1015         FreeTokenstatblk(stoken);
1016         return(entry);
1017     }
1018 
1019     bases = GetResidue(stoken);
1020     if(bases != NULL)
1021         entry->bases = (size_t) atoi(bases);
1022 
1023     if(pp->format == Parser::EFormat::GenBank &&
1024        entry->lc.date > -1)
1025     {
1026         /* last token in the LOCUS line is date of the update's data
1027          */
1028         for(i = 1, ptr = stoken->list; i < stoken->num; i++)
1029             ptr = ptr->next;
1030         entry->date = GetUpdateDate(ptr->str, pp->source);
1031     }
1032 
1033     if(pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL)
1034     {
1035         j = stoken->num - ((pp->format == Parser::EFormat::GenBank) ? 2 : 3);
1036         for(i = 1, ptr = stoken->list; i < j; i++)
1037             ptr = ptr->next;
1038 
1039         if(pp->format == Parser::EFormat::EMBL)
1040         {
1041             if(StringNICmp(ptr->str, "TSA", 3) == 0)
1042                 entry->is_tsa = true;
1043             else if(StringNICmp(ptr->str, "PAT", 3) == 0)
1044                 entry->is_pat = true;
1045         }
1046 
1047         ptr = ptr->next;
1048 
1049         if(StringNICmp(ptr->str, "EST", 3) == 0)
1050             entry->EST = true;
1051         else if(StringNICmp(ptr->str, "STS", 3) == 0)
1052             entry->STS = true;
1053         else if(StringNICmp(ptr->str, "GSS", 3) == 0)
1054             entry->GSS = true;
1055         else if(StringNICmp(ptr->str, "HTC", 3) == 0)
1056             entry->HTC = true;
1057         else if(StringNICmp(ptr->str, "PAT", 3) == 0 &&
1058                 pp->source == Parser::ESource::EMBL)
1059             entry->is_pat = true;
1060     }
1061     FreeTokenstatblk(stoken);
1062 
1063     return(entry);
1064 }
1065 
/**********************************************************
 *
 *   void DelNoneDigitTail(str):
 *
 *      Truncates "str" in place right after its last
 *   decimal digit, so that any trailing run of non-digit
 *   characters is removed. A string without any digit
 *   becomes empty. NULL and empty strings are left alone
 *   (nothing is written to them).
 *
 **********************************************************/
void DelNoneDigitTail(char* str)
{
    char* cut;
    char* cur;

    if (str == NULL || str[0] == '\0')
        return;

    /* "cut" always points one past the most recent digit seen so far;
     * it stays at the start of the string if there is no digit at all.
     */
    cut = str;
    cur = str;
    while (*cur != '\0') {
        const char ch = *cur++;
        if (ch >= '0' && ch <= '9')
            cut = cur;
    }

    *cut = '\0';
}
1089 
/* std::string counterpart of DelNoneDigitTail(): removes the trailing run
 * of non-digit characters, i.e. keeps everything up to and including the
 * last decimal digit. A string that contains no digit at all becomes empty.
 *
 * The previous implementation cut at the FIRST non-digit character, which
 * wiped out any value starting with letters (e.g. "AB123456;" became "").
 */
static void sDelNonDigitTail(string& str)
{
    if (str.empty()) {
        return;
    }

    const auto lastDigitPos = str.find_last_of("0123456789");
    if (lastDigitPos == string::npos) {
        // No digit anywhere: the whole string is "tail".
        str.clear();
    }
    else {
        str.erase(lastDigitPos + 1);
    }
}
1100 
1101 
1102 /**********************************************************
1103  *
1104  * Here X is an alpha character, N - numeric one.
1105  * Return values:
1106  *
1107  * 1 - XXN      (AB123456)
1108  * 2 - XX_N     (NZ_123456)
1109  * 3 - XXXXN    (AAAA01000001)
1110  * 4 - XX_XXXXN (NZ_AAAA01000001)
1111  * 5 - XXXXXN   (AAAAA1234512)
1112  * 6 - XX_XXN   (NZ_AB123456)
1113  * 7 - XXXXNNSN (AAAA01S000001 - scaffolds)
1114  * 8 - XXXXXXN  (AAAAAA010000001)
1115  * 0 - all others
1116  *
1117  */
IsNewAccessFormat(const Char * acnum)1118 Int4 IsNewAccessFormat(const Char* acnum)
1119 {
1120     const Char* p = acnum;
1121 
1122     if(p == NULL || *p == '\0')
1123         return(0);
1124 
1125     if(p[0] >= 'A' && p[0] <= 'Z' && p[1] >= 'A' && p[1] <= 'Z')
1126     {
1127         if(p[2] >= '0' && p[2] <= '9')
1128             return(1);
1129         if(p[2] == '_')
1130         {
1131             if(p[3] >= '0' && p[3] <= '9')
1132                 return(2);
1133             if(p[3] >= 'A' && p[3] <= 'Z' && p[4] >= 'A' && p[4] <= 'Z')
1134             {
1135                 if(p[5] >= 'A' && p[5] <= 'Z' && p[6] >= 'A' && p[6] <= 'Z' &&
1136                    p[7] >= '0' && p[7] <= '9')
1137                     return(4);
1138                 if(p[5] >= '0' && p[5] <= '9')
1139                     return(6);
1140             }
1141         }
1142         if(p[2] >= 'A' && p[2] <= 'Z' && p[3] >= 'A' && p[3] <= 'Z')
1143         {
1144             if(p[4] >= 'A' && p[4] <= 'Z' && p[5] >= 'A' && p[5] <= 'Z' &&
1145                p[6] >= '0' && p[6] <= '9')
1146                 return(8);
1147             if(p[4] >= '0' && p[4] <= '9')
1148             {
1149                 if(p[5] >= '0' && p[5] <= '9' && p[6] == 'S' &&
1150                    p[7] >= '0' && p[7] <= '9')
1151                     return(7);
1152                 return(3);
1153             }
1154 
1155             if(p[4] >= 'A' && p[4] <= 'Z' && p[5] >= '0' && p[6] <= '9')
1156                 return(5);
1157         }
1158     }
1159     return(0);
1160 }
1161 
1162 /**********************************************************/
IsValidAccessPrefix(const char * acc,char ** accpref)1163 static bool IsValidAccessPrefix(const char* acc, char** accpref)
1164 {
1165     Int4 i = IsNewAccessFormat(acc);
1166     if(i == 0 || accpref == NULL)
1167         return false;
1168 
1169     if(i > 2 && i < 9)
1170         return true;
1171 
1172     char** b = accpref;
1173     for (; *b != NULL; b++)
1174     {
1175         if (StringNCmp(acc, *b, StringLen(*b)) == 0)
1176             break;
1177     }
1178 
1179     return (*b != NULL);
1180 }
1181 
1182 /**********************************************************/
/* Checks whether "acnum" has the shape of a master record accession:
 * letter prefix, two digits, then nothing but zeros up to the end of
 * the string (e.g. XXXX00000000 style).
 *
 *   acnum     - accession string;
 *   accformat - its format as returned by IsNewAccessFormat(); only
 *               formats 3, 8 and 4 are considered (prefix lengths
 *               4, 6 and 7 respectively), anything else yields false.
 */
static bool fta_if_master_wgs_accession(const char* acnum, Int4 accformat)
{
    size_t prefixLen;

    switch (accformat) {
    case 3:
        prefixLen = 4;
        break;
    case 8:
        prefixLen = 6;
        break;
    case 4:
        prefixLen = 7;
        break;
    default:
        return false;
    }

    const char* tail = acnum + prefixLen;

    /* Two digits must follow the prefix. */
    if (tail[0] < '0' || tail[0] > '9' || tail[1] < '0' || tail[1] > '9')
        return false;

    /* Everything after those two digits has to be '0'. */
    tail += 2;
    while (*tail == '0')
        ++tail;

    return (*tail == '\0');
}
1206 
1207 
s_IsVDBWGSScaffold(const CTempString & accession)1208 static bool s_IsVDBWGSScaffold(const CTempString& accession)
1209 {
1210     // 4+2+S+[6,7,8]
1211     if (accession.length() < 13 ||
1212         accession.length() > 15 ||
1213         accession[6] != 'S') {
1214         return false;
1215     }
1216 
1217     // check that the first 4 chars are letters
1218     if (any_of(begin(accession),
1219               begin(accession)+4,
1220               [](const char c){ return !isalpha(c); })) {
1221         return false;
1222     }
1223 
1224     // check that the next 2 chars are letters
1225     if (!isdigit(accession[4]) ||
1226         !isdigit(accession[5])) {
1227         return false;
1228     }
1229 
1230     // The characters after 'S' should all be digits
1231     // with at least one non-zero digit
1232 
1233     // First check for digits
1234     if (any_of(begin(accession)+7,
1235                end(accession),
1236                [](const char c){ return !isdigit(c); })) {
1237         return false;
1238     }
1239 
1240     // Now check to see if at least one is not zero
1241     if (all_of(begin(accession)+7,
1242                end(accession),
1243                [](const char c) { return c == '0'; })) {
1244         return false;
1245     }
1246 
1247     return true;
1248 }
1249 
s_RefineWGSType(const CTempString & accession,int initialType)1250 static int s_RefineWGSType(const CTempString& accession, int initialType)
1251 {
1252     if (initialType == -1) {
1253         return initialType;
1254     }
1255         // Identify as TSA or TLS
1256     if(accession[0] == 'G')                       /* TSA-WGS */
1257     {
1258         switch(initialType)
1259         {
1260         case 0:
1261             return 4;
1262         case 1:
1263             return 5;
1264         case 3:
1265             return 6;
1266         default:
1267             return initialType;
1268         }
1269     }
1270 
1271     if (accession[0] == 'K' || accession[1] == 'T') { // TLS
1272         switch(initialType)
1273         {
1274         case 0:
1275             return 10;
1276         case 1:
1277             return 11;
1278         case 3:
1279             return 12;
1280         default:
1281             return initialType;
1282         }
1283     }
1284 
1285     if (initialType == 1) { // TSA again
1286         if (accession[0] == 'I') {
1287             return 8;
1288         }
1289         if (accession[0] == 'H') {
1290             return 9;
1291         }
1292     }
1293 
1294     return initialType;
1295 }
1296 
1297 /**********************************************************/
1298 /* Returns:  0 - if WGS project accession;
1299  *           1 - WGS contig accession;
1300  *           2 - WGS scaffold accession (2+6);
1301  *           3 - WGS master accession (XXXX00000000);
1302  *           4 - TSA-WGS project accession;
1303  *           5 - TSA-WGS contig accession
1304  *           6 - TSA-WGS master accession;
1305  *           7 - VDB WGS scaffold accession (4+2+S+[6,7,8]);
1306  *           8 - TSA-WGS contig DDBJ accession
1307  *           9 - TSA-WGS contig EMBL accession
1308  *          10 - TLS-WGS project accession;
1309  *          11 - TLS-WGS contig accession
1310  *          12 - TLS-WGS master accession;
1311  *          -1 - something else.
1312  */
fta_if_wgs_acc(const CTempString & accession)1313 int fta_if_wgs_acc(const CTempString& accession)
1314 {
1315 
1316     if (accession.empty() ||
1317         NStr::IsBlank(accession)) {
1318         return -1;
1319     }
1320 
1321     const auto length = accession.length();
1322 
1323     if(length == 8 &&
1324        k_WgsScaffoldPrefix.find(accession.substr(0,2)) != k_WgsScaffoldPrefix.end() &&
1325        all_of(begin(accession)+2, end(accession), [](const char c) { return isdigit(c); })) {
1326         return 2;
1327     }
1328 
1329     if(length > 12 && length < 16 && accession[6] == 'S')
1330     {
1331         if (s_IsVDBWGSScaffold(accession)) {
1332             return 7;
1333         }
1334         return -1;
1335     }
1336 
1337     const char* p = accession.data();
1338     if(StringNCmp(p, "NZ_", 3) == 0) {
1339         p += 3;
1340     }
1341     size_t j = StringLen(p);
1342     if(j < 12 || j > 17) {
1343         return -1;
1344     }
1345 
1346     if(isdigit(p[4]))
1347     {
1348         if(all_of(p, p+4, [](const char c) { return isalpha(c); }) &&
1349            all_of(p+4, end(accession), [](const char c) { return isdigit(c); })) {
1350 
1351             int i = -1;
1352             if (any_of(p+6, end(accession), [](const char c) { return c != '0'; })) {
1353                 i = 1; // WGS contig
1354             }
1355             else
1356             if (p[4] == '0' && p[5] == '0') {
1357                 i = 3; // WGS master
1358             }
1359             else {
1360                 i = 0; // WGS project
1361             }
1362             return s_RefineWGSType(p, i);
1363         }
1364         return -1;
1365     }
1366 
1367 
1368     // 6 letters + 2 digits
1369     if (all_of(p, p+6, [](const char c){ return isalpha(c); }) &&
1370         all_of(p+6, end(accession), [](const char c) { return isdigit(c); })) {
1371 
1372         if (any_of(p+8, end(accession), [](const char c) { return c != '0'; })) {
1373             return 1; // WGS contig
1374         }
1375 
1376         if (p[6] == '0' && p[7] == '0') {
1377             return 3; // WGS master
1378         }
1379         return 0; // WGS project
1380     }
1381 
1382     return -1; // unknown
1383 }
1384 
1385 /**********************************************************/
IsSPROTAccession(const char * acc)1386 bool IsSPROTAccession(const char* acc)
1387 {
1388     const char **b;
1389 
1390     if(acc == NULL || acc[0] == '\0')
1391         return false;
1392     size_t len = StringLen(acc);
1393     if(len != 6 && len != 8 && len != 10)
1394         return false;
1395     if(len == 8)
1396     {
1397         for (b = sprot_accpref; *b != NULL; b++)
1398         {
1399             if (StringNCmp(*b, acc, 2) == 0)
1400                 break;
1401         }
1402 
1403         return (*b != NULL);
1404     }
1405 
1406     if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < '0' || acc[1] > '9' ||
1407        ((acc[3] < '0' || acc[3] > '9') && (acc[3] < 'A' || acc[3] > 'Z')) ||
1408        ((acc[4] < '0' || acc[4] > '9') && (acc[4] < 'A' || acc[4] > 'Z')) ||
1409        acc[5] < '0' || acc[5] > '9')
1410         return false;
1411 
1412     if(acc[0] >= 'O' && acc[0] <= 'Q')
1413     {
1414         if((acc[2] < '0' || acc[2] > '9') && (acc[2] < 'A' || acc[2] > 'Z'))
1415             return false;
1416     }
1417     else if(acc[2] < 'A' || acc[2] > 'Z')
1418         return false;
1419 
1420     if(len == 6)
1421         return true;
1422 
1423     if(acc[0] >= 'O' && acc[0] <= 'Q')
1424         return false;
1425 
1426     if(acc[6] < 'A' || acc[6] > 'Z' || acc[9] < '0' || acc[9] > '9' ||
1427        ((acc[7] < 'A' || acc[7] > 'Z') && (acc[7] < '0' || acc[7] > '9')) ||
1428        ((acc[8] < 'A' || acc[8] > 'Z') && (acc[8] < '0' || acc[8] > '9')))
1429         return false;
1430 
1431     return true;
1432 }
1433 
1434 
1435 
sCheckAccession(const list<string> & tokens,Parser::ESource source,Parser::EMode mode,const char * priacc,int skip)1436 static bool sCheckAccession(const list<string>& tokens,
1437                             Parser::ESource source,
1438                             Parser::EMode mode,
1439                             const char* priacc, int skip)
1440 {
1441     TokenBlkPtr tbp;
1442     bool        badac;
1443     bool        res = true;
1444     bool        iswgs;
1445     Char        acnum[200];
1446     Int4        accformat;
1447     Int4        priformat;
1448     Int4        count;
1449     size_t        i;
1450 
1451     if(priacc == NULL || mode == Parser::EMode::Relaxed)
1452         return true;
1453 
1454     auto it = tokens.begin();
1455     if (skip) {
1456         advance(it, skip);
1457     }
1458 
1459     priformat = IsNewAccessFormat(priacc);
1460     if((priformat == 3 || priformat == 4 || priformat == 8) &&
1461        fta_if_master_wgs_accession(priacc, priformat) == false)
1462         iswgs = true;
1463     else
1464         iswgs = false;
1465 
1466     count = 0;
1467     for(; it != tokens.end(); ++it)
1468     {
1469         StringCpy(acnum, it->c_str());
1470         if(acnum[0] == '-' && acnum[1] == '\0')
1471             continue;
1472 
1473         if(skip == 2 && count == 0)
1474             accformat = priformat;
1475         else
1476             accformat = IsNewAccessFormat(acnum);
1477 
1478         size_t len = StringLen(acnum);
1479         if(acnum[len-1] == ';')
1480         {
1481             len--;
1482             acnum[len] = '\0';
1483         }
1484         badac = false;
1485         if(accformat == 1)
1486         {
1487             if(len != 8 && len != 10)
1488                 badac = true;
1489             else
1490             {
1491                 for(i = 2; i < 8 && badac == false; i++)
1492                     if(acnum[i] < '0' || acnum[i] > '9')
1493                         badac = true;
1494             }
1495         }
1496         else if(accformat == 2)
1497         {
1498             if(len != 9 && len != 12)
1499                 badac = true;
1500             else
1501             {
1502                 for(i = 3; i < len && badac == false; i++)
1503                     if(acnum[i] < '0' || acnum[i] > '9')
1504                         badac = true;
1505             }
1506         }
1507         else if(accformat == 3)
1508         {
1509             if(len < 12 || len > 14)
1510                 badac = true;
1511             else
1512             {
1513                 for(i = 4; i < len && badac == false; i++)
1514                     if(acnum[i] < '0' || acnum[i] > '9')
1515                         badac = true;
1516             }
1517         }
1518         else if(accformat == 8)
1519         {
1520             if(len < 15 || len > 17)
1521                 badac = true;
1522             else
1523             {
1524                 for(i = 6; i < len && !badac; i++)
1525                     if(acnum[i] < '0' || acnum[i] > '9')
1526                         badac = true;
1527             }
1528         }
1529         else if(accformat == 4)
1530         {
1531             if(len < 15 || len > 17)
1532                 badac = true;
1533             else
1534             {
1535                 for(i = 7; i < len && badac == false; i++)
1536                     if(acnum[i] < '0' || acnum[i] > '9')
1537                         badac = true;
1538             }
1539         }
1540         else if(accformat == 5)
1541         {
1542             if(len != 12)
1543                 badac = true;
1544             else
1545             {
1546                 for(i = 5; i < len && badac == false; i++)
1547                     if(acnum[i] < '0' || acnum[i] > '9')
1548                         badac = true;
1549             }
1550         }
1551         else if(accformat == 6)
1552         {
1553             if(len != 11 || acnum[0] != 'N' || acnum[1] != 'Z' ||
1554                acnum[2] != '_' || acnum[3] < 'A' || acnum[3] > 'Z' ||
1555                acnum[4] < 'A' || acnum[4] > 'Z')
1556                 badac = true;
1557             else
1558             {
1559                 for(i = 5; i < len && badac == false; i++)
1560                     if(acnum[i] < '0' || acnum[i] > '9')
1561                         badac = true;
1562             }
1563         }
1564         else if(accformat == 7)
1565         {
1566             if(len < 13 || len > 15)
1567                 badac = true;
1568             else
1569             {
1570                 for(i = 7; i < len && badac == false; i++)
1571                     if(acnum[i] < '0' || acnum[i] > '9')
1572                         badac = true;
1573             }
1574         }
1575         else if(accformat == 0)
1576         {
1577             if(len != 6 && len != 10)
1578                 badac = true;
1579             else if(acnum[0] >= 'A' && acnum[0] <= 'Z')
1580             {
1581                 if(source == Parser::ESource::SPROT)
1582                 {
1583                     if(!IsSPROTAccession(acnum))
1584                         badac = true;
1585                 }
1586                 else if(len == 10)
1587                 {
1588                     badac = true;
1589                 }
1590                 else
1591                 {
1592                     for(i = 1; i < 6 && badac == false; i++)
1593                         if(acnum[i] < '0' || acnum[i] > '9')
1594                             badac = true;
1595                 }
1596             }
1597             else
1598                 badac = true;
1599         }
1600         else
1601             badac = true;
1602 
1603         if(badac)
1604         {
1605             ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
1606                       "Bad accession #, %s for this entry", acnum);
1607             res = false;
1608             count++;
1609             continue;
1610         }
1611 
1612         if(skip == 2 && count == 0 && !iswgs &&
1613            (accformat == 3 || accformat == 4 || accformat == 8))
1614         {
1615             ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSProjectAccIsPri,
1616                       "This record has a WGS 'project' accession as its primary accession number. WGS project-accessions are only expected to be used as secondary accession numbers.");
1617             res = false;
1618         }
1619         count++;
1620     }
1621 
1622     return(res);
1623 }
1624 
1625 /**********************************************************
1626  *
1627  *   static bool CheckAccession(stoken, source, entryacc,
1628  *                                 skip):
1629  *
1630  *      A valid accession number should be an upper case
1631  *   letter (A-Z) followed by 5 digits, put "reject" message
1632  *   if not.
1633  *
1634  *                                              7-6-93
1635  *
1636  **********************************************************/
CheckAccession(TokenStatBlkPtr stoken,Parser::ESource source,Parser::EMode mode,char * priacc,Int4 skip)1637 static bool CheckAccession(TokenStatBlkPtr stoken,
1638                            Parser::ESource source,
1639                            Parser::EMode mode,
1640                            char* priacc, Int4 skip)
1641 {
1642     TokenBlkPtr tbp;
1643     bool        badac;
1644     bool        res = true;
1645     bool        iswgs;
1646     Char        acnum[200];
1647     Int4        accformat;
1648     Int4        priformat;
1649     Int4        count;
1650     size_t        i;
1651 
1652     if(priacc == NULL || mode == Parser::EMode::Relaxed)
1653         return true;
1654 
1655     tbp = (skip == 0) ? stoken->list : stoken->list->next;
1656     priformat = IsNewAccessFormat(priacc);
1657     if((priformat == 3 || priformat == 4 || priformat == 8) &&
1658        fta_if_master_wgs_accession(priacc, priformat) == false)
1659         iswgs = true;
1660     else
1661         iswgs = false;
1662 
1663     count = 0;
1664     for(; tbp != NULL; tbp = tbp->next)
1665     {
1666         StringCpy(acnum, tbp->str);
1667         if(acnum[0] == '-' && acnum[1] == '\0')
1668             continue;
1669 
1670         if(skip == 2 && count == 0)
1671             accformat = priformat;
1672         else
1673             accformat = IsNewAccessFormat(acnum);
1674 
1675         size_t len = StringLen(acnum);
1676         if(acnum[len-1] == ';')
1677         {
1678             len--;
1679             acnum[len] = '\0';
1680         }
1681         badac = false;
1682         if(accformat == 1)
1683         {
1684             if(len != 8 && len != 10)
1685                 badac = true;
1686             else
1687             {
1688                 for(i = 2; i < 8 && badac == false; i++)
1689                     if(acnum[i] < '0' || acnum[i] > '9')
1690                         badac = true;
1691             }
1692         }
1693         else if(accformat == 2)
1694         {
1695             if(len != 9 && len != 12)
1696                 badac = true;
1697             else
1698             {
1699                 for(i = 3; i < len && badac == false; i++)
1700                     if(acnum[i] < '0' || acnum[i] > '9')
1701                         badac = true;
1702             }
1703         }
1704         else if(accformat == 3)
1705         {
1706             if(len < 12 || len > 14)
1707                 badac = true;
1708             else
1709             {
1710                 for(i = 4; i < len && badac == false; i++)
1711                     if(acnum[i] < '0' || acnum[i] > '9')
1712                         badac = true;
1713             }
1714         }
1715         else if(accformat == 8)
1716         {
1717             if(len < 15 || len > 17)
1718                 badac = true;
1719             else
1720             {
1721                 for(i = 6; i < len && !badac; i++)
1722                     if(acnum[i] < '0' || acnum[i] > '9')
1723                         badac = true;
1724             }
1725         }
1726         else if(accformat == 4)
1727         {
1728             if(len < 15 || len > 17)
1729                 badac = true;
1730             else
1731             {
1732                 for(i = 7; i < len && badac == false; i++)
1733                     if(acnum[i] < '0' || acnum[i] > '9')
1734                         badac = true;
1735             }
1736         }
1737         else if(accformat == 5)
1738         {
1739             if(len != 12)
1740                 badac = true;
1741             else
1742             {
1743                 for(i = 5; i < len && badac == false; i++)
1744                     if(acnum[i] < '0' || acnum[i] > '9')
1745                         badac = true;
1746             }
1747         }
1748         else if(accformat == 6)
1749         {
1750             if(len != 11 || acnum[0] != 'N' || acnum[1] != 'Z' ||
1751                acnum[2] != '_' || acnum[3] < 'A' || acnum[3] > 'Z' ||
1752                acnum[4] < 'A' || acnum[4] > 'Z')
1753                 badac = true;
1754             else
1755             {
1756                 for(i = 5; i < len && badac == false; i++)
1757                     if(acnum[i] < '0' || acnum[i] > '9')
1758                         badac = true;
1759             }
1760         }
1761         else if(accformat == 7)
1762         {
1763             if(len < 13 || len > 15)
1764                 badac = true;
1765             else
1766             {
1767                 for(i = 7; i < len && badac == false; i++)
1768                     if(acnum[i] < '0' || acnum[i] > '9')
1769                         badac = true;
1770             }
1771         }
1772         else if(accformat == 0)
1773         {
1774             if(len != 6 && len != 10)
1775                 badac = true;
1776             else if(acnum[0] >= 'A' && acnum[0] <= 'Z')
1777             {
1778                 if(source == Parser::ESource::SPROT)
1779                 {
1780                     if(!IsSPROTAccession(acnum))
1781                         badac = true;
1782                 }
1783                 else if(len == 10)
1784                 {
1785                     badac = true;
1786                 }
1787                 else
1788                 {
1789                     for(i = 1; i < 6 && badac == false; i++)
1790                         if(acnum[i] < '0' || acnum[i] > '9')
1791                             badac = true;
1792                 }
1793             }
1794             else
1795                 badac = true;
1796         }
1797         else
1798             badac = true;
1799 
1800         if(badac)
1801         {
1802             ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
1803                       "Bad accession #, %s for this entry", acnum);
1804             res = false;
1805             count++;
1806             continue;
1807         }
1808 
1809         if(skip == 2 && count == 0 && !iswgs &&
1810            (accformat == 3 || accformat == 4 || accformat == 8))
1811         {
1812             ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSProjectAccIsPri,
1813                       "This record has a WGS 'project' accession as its primary accession number. WGS project-accessions are only expected to be used as secondary accession numbers.");
1814             res = false;
1815         }
1816         count++;
1817     }
1818 
1819     return(res);
1820 }
1821 
1822 /**********************************************************/
IsPatentedAccPrefix(const Parser & parseInfo,const char * acc)1823 static bool IsPatentedAccPrefix(const Parser& parseInfo, const char* acc)
1824 {
1825     if(acc[2] == '\0')
1826     {
1827         if((StringCmp(acc, "AR") == 0 || StringCmp(acc, "DZ") == 0 ||
1828             StringCmp(acc, "EA") == 0 || StringCmp(acc, "GC") == 0 ||
1829             StringCmp(acc, "GP") == 0 || StringCmp(acc, "GV") == 0 ||
1830             StringCmp(acc, "GX") == 0 || StringCmp(acc, "GY") == 0 ||
1831             StringCmp(acc, "GZ") == 0 || StringCmp(acc, "HJ") == 0 ||
1832             StringCmp(acc, "HK") == 0 || StringCmp(acc, "HL") == 0 ||
1833             StringCmp(acc, "KH") == 0 || StringCmp(acc, "MI") == 0 ||
1834             StringCmp(acc, "MM") == 0 || StringCmp(acc, "MO") == 0) &&
1835            (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1836             return true;
1837         if((StringNCmp(acc, "AX", 2) == 0 || StringNCmp(acc, "CQ", 2) == 0 ||
1838             StringNCmp(acc, "CS", 2) == 0 || StringNCmp(acc, "FB", 2) == 0 ||
1839             StringNCmp(acc, "HA", 2) == 0 || StringNCmp(acc, "HB", 2) == 0 ||
1840             StringNCmp(acc, "HC", 2) == 0 || StringNCmp(acc, "HD", 2) == 0 ||
1841             StringNCmp(acc, "HH", 2) == 0 || StringNCmp(acc, "GM", 2) == 0 ||
1842             StringNCmp(acc, "GN", 2) == 0 || StringNCmp(acc, "JA", 2) == 0 ||
1843             StringNCmp(acc, "JB", 2) == 0 || StringNCmp(acc, "JC", 2) == 0 ||
1844             StringNCmp(acc, "JD", 2) == 0 || StringNCmp(acc, "JE", 2) == 0 ||
1845             StringNCmp(acc, "HI", 2) == 0 || StringNCmp(acc, "LP", 2) == 0 ||
1846             StringNCmp(acc, "LQ", 2) == 0 || StringNCmp(acc, "MP", 2) == 0 ||
1847             StringNCmp(acc, "MQ", 2) == 0 || StringNCmp(acc, "MR", 2) == 0 ||
1848             StringNCmp(acc, "MS", 2) == 0) &&
1849            (parseInfo.all == true || parseInfo.source == Parser::ESource::EMBL))
1850            return true;
1851         if ((StringNCmp(acc, "BD", 2) == 0 || StringNCmp(acc, "DD", 2) == 0 ||
1852             StringNCmp(acc, "DI", 2) == 0 || StringNCmp(acc, "DJ", 2) == 0 ||
1853             StringNCmp(acc, "DL", 2) == 0 || StringNCmp(acc, "DM", 2) == 0 ||
1854             StringNCmp(acc, "FU", 2) == 0 || StringNCmp(acc, "FV", 2) == 0 ||
1855             StringNCmp(acc, "FW", 2) == 0 || StringNCmp(acc, "FZ", 2) == 0 ||
1856             StringNCmp(acc, "GB", 2) == 0 || StringNCmp(acc, "HV", 2) == 0 ||
1857             StringNCmp(acc, "HW", 2) == 0 || StringNCmp(acc, "HZ", 2) == 0 ||
1858             StringNCmp(acc, "LF", 2) == 0 || StringNCmp(acc, "LG", 2) == 0 ||
1859             StringNCmp(acc, "LV", 2) == 0 || StringNCmp(acc, "LX", 2) == 0 ||
1860             StringNCmp(acc, "LY", 2) == 0 || StringNCmp(acc, "LZ", 2) == 0 ||
1861             StringNCmp(acc, "MA", 2) == 0 || StringNCmp(acc, "MB", 2) == 0 ||
1862             StringNCmp(acc, "MC", 2) == 0 || StringNCmp(acc, "MD", 2) == 0 ||
1863             StringNCmp(acc, "ME", 2) == 0 || StringNCmp(acc, "OF", 2) == 0) &&
1864            (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1865            return true;
1866 
1867         return false;
1868     }
1869 
1870     if(acc[1] == '\0' && (*acc == 'I' || *acc == 'A' || *acc == 'E'))
1871     {
1872         if(parseInfo.all == true ||
1873            (*acc == 'I' && parseInfo.source == Parser::ESource::NCBI) ||
1874            (*acc == 'A' && parseInfo.source == Parser::ESource::EMBL) ||
1875            (*acc == 'E' && parseInfo.source == Parser::ESource::DDBJ))
1876            return true;
1877     }
1878     return false;
1879 }
1880 
1881 /**********************************************************/
IsTPAAccPrefix(const Parser & parseInfo,const char * acc)1882 static bool IsTPAAccPrefix(const Parser& parseInfo, const char* acc)
1883 {
1884     if(acc == NULL)
1885         return(false);
1886 
1887     size_t i = StringLen(acc);
1888     if(i != 2 && i != 4)
1889         return(false);
1890 
1891     if(i == 4)
1892     {
1893         if(acc[0] == 'D' &&
1894            (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1895             return(true);
1896         if(acc[0] == 'E' &&
1897            (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1898             return(true);
1899         return(false);
1900     }
1901 
1902     if(fta_StringMatch(ncbi_tpa_accpref, acc) > -1 &&
1903        (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1904         return(true);
1905     if(fta_StringMatch(ddbj_tpa_accpref, acc) > -1 &&
1906        (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1907         return(true);
1908     return(false);
1909 }
1910 
1911 /**********************************************************/
IsWGSAccPrefix(const Parser & parseInfo,const char * acc)1912 static bool IsWGSAccPrefix(const Parser& parseInfo, const char* acc)
1913 {
1914     if(acc == NULL || StringLen(acc) != 2)
1915         return(false);
1916 
1917     if(fta_StringMatch(ncbi_wgs_accpref, acc) > -1 &&
1918        (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1919         return(true);
1920     if(fta_StringMatch(ddbj_wgs_accpref, acc) > -1 &&
1921        (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1922         return(true);
1923     return(false);
1924 }
1925 
1926 /**********************************************************/
IsTSAAccPrefix(const Parser & parseInfo,const char * acc,IndexblkPtr ibp)1927 static void IsTSAAccPrefix(const Parser& parseInfo, const char* acc, IndexblkPtr ibp)
1928 {
1929     if(acc == NULL || *acc == '\0')
1930         return;
1931 
1932     if(parseInfo.source == Parser::ESource::EMBL)
1933     {
1934         ibp->tsa_allowed = true;
1935         return;
1936     }
1937 
1938     if(acc[0] == 'U' && acc[1] == '\0' &&
1939        (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1940     {
1941         ibp->tsa_allowed = true;
1942         return;
1943     }
1944 
1945     if(StringLen(acc) != 2 && StringLen(acc) != 4)
1946         return;
1947 
1948     if(parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI)
1949     {
1950         if((StringLen(acc) == 2 &&
1951             (StringCmp(acc, "EZ") == 0 || StringCmp(acc, "HP") == 0 ||
1952              StringCmp(acc, "JI") == 0 || StringCmp(acc, "JL") == 0 ||
1953              StringCmp(acc, "JO") == 0 || StringCmp(acc, "JP") == 0 ||
1954              StringCmp(acc, "JR") == 0 || StringCmp(acc, "JT") == 0 ||
1955              StringCmp(acc, "JU") == 0 || StringCmp(acc, "JV") == 0 ||
1956              StringCmp(acc, "JW") == 0 || StringCmp(acc, "KA") == 0)) ||
1957            fta_if_wgs_acc(ibp->acnum) == 5)
1958         {
1959             ibp->is_tsa = true;
1960             ibp->tsa_allowed = true;
1961         }
1962         if(fta_StringMatch(acc_tsa_allowed, acc) > -1)
1963             ibp->tsa_allowed = true;
1964     }
1965 
1966     if(parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ)
1967     {
1968         if(StringNCmp(acc, "FX", 2) == 0 || StringNCmp(acc, "LA", 2) == 0 ||
1969            StringNCmp(acc, "LE", 2) == 0 || StringNCmp(acc, "LH", 2) == 0 ||
1970            StringNCmp(acc, "LI", 2) == 0 || StringNCmp(acc, "LJ", 2) == 0 ||
1971            fta_if_wgs_acc(ibp->acnum) == 8)
1972         {
1973             ibp->is_tsa = true;
1974             ibp->tsa_allowed = true;
1975         }
1976     }
1977 
1978     if(parseInfo.all == true || parseInfo.source == Parser::ESource::EMBL)
1979     {
1980         if(fta_if_wgs_acc(ibp->acnum) == 9)
1981         {
1982             ibp->is_tsa = true;
1983             ibp->tsa_allowed = true;
1984         }
1985     }
1986 }
1987 
1988 /**********************************************************/
IsTLSAccPrefix(const Parser & parseInfo,const char * acc,IndexblkPtr ibp)1989 static void IsTLSAccPrefix(const Parser& parseInfo, const char* acc, IndexblkPtr ibp)
1990 {
1991     if(acc == NULL || *acc == '\0' || StringLen(acc) != 4)
1992         return;
1993 
1994     if(parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI ||
1995        parseInfo.source == Parser::ESource::DDBJ)
1996         if(fta_if_wgs_acc(ibp->acnum) == 11)
1997             ibp->is_tls = true;
1998 }
1999 
/* True when "c" is one of the upper-case letters 'A'..'Z'. */
static bool sIsAccPrefixChar(char c)
{
    return 'A' <= c && c <= 'Z';
}
2003 /**********************************************************
2004  *
2005  *   bool GetAccession(pp, str, entry, skip):
2006  *
2007  *      Only record the first line of the first accession
2008  *   number.
2009  *      PIR format, accession number does not follow
2010  *   the rule.
2011  *
2012  *                                              3-4-93
2013  *
2014  **********************************************************/
2015 /*
2016 bool GetAccession(const Parser& parseInfo, const CTempString& str, IndexblkPtr entry, int skip)
2017 {
2018     string accession;
2019     list<string> tokens;
2020     bool get = true;
2021 
2022     if((skip != 2 && parseInfo.source == Parser::ESource::Flybase) ||
2023        parserInfo.source == Parser::ESource::USPTO)
2024         return true;
2025 
2026     NStr::Split(str, " ;", tokens, NStr::fSplit_Tokenize);
2027 
2028 
2029     if (skip != 2)
2030     {
2031         get = ParseAccessionRange(tokens, skip);
2032         if (get)
2033             get = sCheckAccession(tokens, parseInfo.source, parseInfo.mode, entry->acnum, skip);
2034         if (!get)
2035             entry->drop = 1;
2036 
2037         if (tokens.size()>skip && skip<2) { // Not sure about the logic
2038             auto it = skip ? next(tokens.begin(), skip) : tokens.begin();
2039             move(it, tokens.end(), entry->secondary_accessions.end());
2040         }
2041         return get;
2042     }
2043 
2044     // skip == 2
2045     entry->is_tpa = false;
2046     if(tokens.size() < 2)
2047     {
2048         if (parseInfo.mode != Parser::EMode::Relaxed) {
2049             ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
2050                     "No accession # for this entry, about line %ld",
2051                     (long int) entry->linenum);
2052             entry->drop = 1;
2053         }
2054         return false;
2055     }
2056 
2057 
2058     accession = *next(tokens.begin());
2059     sDelNonDigitTail(accession);
2060 
2061     StringCpy(entry->acnum, accession.c_str());
2062 
2063     if (parseInfo.format != Parser::EFormat::XML) {
2064         string temp = accession;
2065         if (parseInfo.accver && entry->vernum > 0) {
2066             temp += "." + NStr::NumericToString(entry->vernum);
2067         }
2068         if (temp.empty()) {
2069             if (entry->locusname[0] != '\0') {
2070                 temp = entry->locusname;
2071             }
2072             else {
2073                 temp = "???";
2074             }
2075         }
2076         FtaInstallPrefix(PREFIX_ACCESSION, temp.c_str(), NULL);
2077     }
2078 
2079     if (parseInfo.source == Parser::ESource::Flybase)
2080     {
2081         return true;
2082     }
2083 
2084     if (accession.size() < 2) {
2085         ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2086                   "Wrong accession [%s] for this entry.", accession.c_str());
2087         entry->drop = 1;
2088         return false;
2089     }
2090 
2091     if (sIsAccPrefixChar(accession[0]) && sIsAccPrefixChar(accession[1])) {
2092         if (parseInfo.accpref && !IsValidAccessPrefix(accession.c_str(), parseInfo.accpref)) {
2093             get = false;
2094         }
2095 
2096         if (sIsAccPrefixChar(accession[2]) && sIsAccPrefixChar(accession[3])) {
2097             if (sIsAccPrefixChar(accession[4])) {
2098                 accession = accession.substr(0,5);
2099             }
2100             else {
2101                 accession = accession.substr(0,4);
2102             }
2103         }
2104         else if (accession[2] == '_') {
2105             accession = accession.substr(0,3);
2106         }
2107         else {
2108             accession = accession.substr(0,2);
2109         }
2110     }
2111     else {
2112         if (parseInfo.acprefix && !StringChr(parseInfo.acprefix, accession[0])) {
2113             get = false;
2114         }
2115         accession = accession.substr(0,1);
2116     }
2117 
2118     if (get) {
2119         if (tokens.size() > 2) {
2120             get = ParseAccessionRange(tokens,2);
2121             if (get) {
2122                 get = sCheckAccession(tokens, parseInfo.source, parseInfo.mode, entry->acnum, 2);
2123             }
2124         }
2125     }
2126     else {
2127         string sourceName = sourceNames.at(parseInfo.source);
2128         ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2129                   "Wrong accession # prefix [%s] for this source: %s",
2130                   accession.c_str(), sourceName.c_str());
2131     }
2132 
2133     entry->secondary_accessions.clear(); // Is this necessary?
2134     move(next(tokens.begin(),2), tokens.end(), entry->secondary_accessions.begin());
2135 
2136     if (!entry->is_pat) {
2137         entry->is_pat = IsPatentedAccPrefix(parseInfo, accession.c_str());
2138     }
2139     entry->is_tpa = IsTPAAccPrefix(parseInfo, accession.c_str());
2140     entry->is_wgs = IsWGSAccPrefix(parseInfo, accession.c_str());
2141     IsTSAAccPrefix(parseInfo, accession.c_str(), entry);
2142     IsTLSAccPrefix(parseInfo, accession.c_str(), entry);
2143 
2144     auto i = IsNewAccessFormat(entry->acnum);
2145     if(i == 3 || i == 8)
2146     {
2147         entry->is_wgs = true;
2148         entry->wgs_and_gi |= 02;
2149     }
2150     else if(i == 5)
2151     {
2152         char* p = entry->acnum;
2153         if(parseInfo.source != Parser::ESource::DDBJ || *p != 'A' || StringLen(p) != 12 ||
2154            StringCmp(p + 5, "0000000") != 0)
2155         {
2156             string sourceName = sourceNames.at(parseInfo.source);
2157             ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2158                       "Wrong accession \"%s\" for this source: %s",
2159                       p, sourceName.c_str());
2160             get = false;
2161         }
2162         entry->is_mga = true;
2163     }
2164 
2165     if(!get)
2166         entry->drop = 1;
2167 
2168     return get;
2169 }
2170 */
2171 
2172 
GetAccession(ParserPtr pp,char * str,IndexblkPtr entry,Int4 skip)2173 bool GetAccession(ParserPtr pp, char* str, IndexblkPtr entry, Int4 skip)
2174 {
2175     Char            acc[200];
2176     Char            temp[400];
2177     char*         line;
2178     char*         p;
2179     TokenStatBlkPtr stoken;
2180     TokenBlkPtr     tbp;
2181     TokenBlkPtr     ttbp;
2182     bool            get = true;
2183     Int4            i;
2184 
2185     if((skip != 2 && pp->source == Parser::ESource::Flybase) ||
2186        pp->source == Parser::ESource::USPTO)
2187         return true;
2188 
2189     line = StringSave(str);
2190     for(p = line; *p != '\0'; p++)
2191         if(*p == ';')
2192             *p = ' ';
2193     stoken = TokenString(line, ' ');
2194 
2195     if(skip != 2)
2196     {
2197         get = ParseAccessionRange(stoken, skip);
2198         if(get)
2199             get = CheckAccession(stoken, pp->source, pp->mode, entry->acnum, skip);
2200         if(!get)
2201             entry->drop = 1;
2202 
2203         if(skip == 0)
2204         {
2205             tbp = stoken->list;
2206             stoken->list = NULL;
2207         }
2208         else if(skip == 1 && stoken->list != NULL)
2209         {
2210             tbp = stoken->list->next;
2211             stoken->list->next = NULL;
2212         }
2213         else
2214             tbp = NULL;
2215         if(tbp != NULL)
2216         {
2217             if(entry->secaccs == NULL)
2218                 entry->secaccs = tbp;
2219             else
2220             {
2221                 for(ttbp = entry->secaccs; ttbp->next != NULL;)
2222                     ttbp = ttbp->next;
2223                 ttbp->next = tbp;
2224             }
2225         }
2226 
2227         FreeTokenstatblk(stoken);
2228         MemFree(line);
2229         return(get);
2230     }
2231 
2232     entry->is_tpa = false;
2233     acc[0] = '\0';
2234     if(stoken->num < 2)
2235     {
2236         if (pp->mode != Parser::EMode::Relaxed) {
2237             ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
2238                     "No accession # for this entry, about line %ld",
2239                     (long int) entry->linenum);
2240             entry->drop = 1;
2241         }
2242         FreeTokenstatblk(stoken);
2243         MemFree(line);
2244         return false;
2245     }
2246 
2247     StringCpy(acc, stoken->list->next->str);    /* get first accession */
2248 
2249     if (pp->mode != Parser::EMode::Relaxed) {
2250         DelNoneDigitTail(acc);
2251     }
2252 
2253     StringCpy(entry->acnum, acc);
2254 
2255     if(pp->format != Parser::EFormat::XML)
2256     {
2257         if(pp->accver && entry->vernum > 0)
2258             sprintf(temp, "%s.%d", acc, entry->vernum);
2259         else
2260             StringCpy(temp, acc);
2261 
2262         if(*temp == '\0')
2263         {
2264             if(entry->locusname[0] != '\0')
2265                 StringCpy(temp, entry->locusname);
2266             else
2267                 StringCpy(temp, "???");
2268         }
2269         FtaInstallPrefix(PREFIX_ACCESSION, temp, NULL);
2270     }
2271 
2272     if(pp->source == Parser::ESource::Flybase)
2273     {
2274         FreeTokenstatblk(stoken);
2275         MemFree(line);
2276         return true;
2277     }
2278 
2279     if((StringLen(acc) < 2) &&
2280         pp->mode != Parser::EMode::Relaxed)
2281     {
2282         ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2283                   "Wrong accession [%s] for this entry.", acc);
2284         FreeTokenstatblk(stoken);
2285         entry->drop = 1;
2286         MemFree(line);
2287         return false;
2288     }
2289 
2290     if (pp->mode != Parser::EMode::Relaxed) {
2291         if(acc[0] >= 'A' && acc[0] <= 'Z' && acc[1] >= 'A' && acc[1] <= 'Z')
2292         {
2293             if(IsValidAccessPrefix(acc, pp->accpref) == false && pp->accpref != NULL)
2294                 get = false;
2295             if(acc[2] >= 'A' && acc[2] <= 'Z' && acc[3] >= 'A' && acc[3] <= 'Z')
2296             {
2297                 if(acc[4] >= 'A' && acc[4] <= 'Z') {
2298                     acc[5] = '\0';
2299                 }
2300                 else {
2301                     acc[4] = '\0';
2302                 }
2303             }
2304             else if(acc[2] == '_') {
2305                 acc[3] = '\0';
2306             }
2307             else {
2308                 acc[2] = '\0';
2309             }
2310         }
2311         else
2312         {
2313             /* Processing of accession numbers in old format
2314             */
2315             /* check valid prefix accession number
2316             */
2317             if(pp->acprefix != NULL && StringChr(pp->acprefix, *acc) == NULL)
2318                 get = false;
2319             acc[1] = '\0';
2320         }
2321     }
2322 
2323     if(get)
2324     {
2325         if (stoken->num > 2)
2326             get = ParseAccessionRange(stoken, 2);
2327         if (get) {
2328             get = CheckAccession(stoken, pp->source, pp->mode, entry->acnum, 2);
2329         }
2330     }
2331     else
2332     {
2333         string sourceName = sourceNames.at(pp->source);
2334         ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2335                   "Wrong accession # prefix [%s] for this source: %s",
2336                   acc, sourceName.c_str());
2337     }
2338 
2339     entry->secaccs = stoken->list->next->next;
2340     stoken->list->next->next = NULL;
2341 
2342     FreeTokenstatblk(stoken);
2343 
2344     if(!entry->is_pat)
2345         entry->is_pat = IsPatentedAccPrefix(*pp, acc);
2346     entry->is_tpa = IsTPAAccPrefix(*pp, acc);
2347     entry->is_wgs = IsWGSAccPrefix(*pp, acc);
2348     IsTSAAccPrefix(*pp, acc, entry);
2349     IsTLSAccPrefix(*pp, acc, entry);
2350 
2351     i = IsNewAccessFormat(entry->acnum);
2352     if(i == 3 || i == 8)
2353     {
2354         entry->is_wgs = true;
2355         entry->wgs_and_gi |= 02;
2356     }
2357     else if(i == 5)
2358     {
2359         p = entry->acnum;
2360         if(pp->source != Parser::ESource::DDBJ || *p != 'A' || StringLen(p) != 12 ||
2361            StringCmp(p + 5, "0000000") != 0)
2362         {
2363             string sourceName = sourceNames.at(pp->source);
2364             ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2365                       "Wrong accession \"%s\" for this source: %s",
2366                       p, sourceName.c_str());
2367             get = false;
2368         }
2369         entry->is_mga = true;
2370     }
2371 
2372     MemFree(line);
2373 
2374     if(!get)
2375         entry->drop = 1;
2376 
2377     return(get);
2378 }
2379 
2380 /**********************************************************/
ResetParserStruct(ParserPtr pp)2381 void ResetParserStruct(ParserPtr pp)
2382 {
2383     if(pp == NULL)
2384         return;
2385 
2386     if(pp->entrylist != NULL)
2387     {
2388         for(Int4 i = 0; i < pp->indx; i++)
2389             if(pp->entrylist[i] != NULL)
2390                 FreeIndexblk(pp->entrylist[i]);
2391 
2392         MemFree(pp->entrylist);
2393         pp->entrylist = NULL;
2394     }
2395 
2396     pp->indx = 0;
2397     pp->curindx = 0;
2398 
2399     if(pp->pbp != NULL)
2400     {
2401         if(pp->pbp->ibp != NULL)
2402             delete pp->pbp->ibp;
2403         delete pp->pbp;
2404         pp->pbp = NULL;
2405     }
2406 
2407 
2408     if(pp->operon != NULL)
2409     {
2410         fta_operon_free(pp->operon);
2411         pp->operon = NULL;
2412     }
2413 }
2414 
2415 /**********************************************************
2416  *
2417  *   void FreeParser(pp):
2418  *
2419  *                                              3-5-93
2420  *
2421  **********************************************************/
2422 /*
2423 void FreeParser(ParserPtr pp)
2424 {
2425     if(pp == NULL)
2426         return;
2427 
2428     ResetParserStruct(pp);
2429 
2430     if(pp->fpo != NULL)
2431         MemFree(pp->fpo);
2432     delete pp;
2433 }
2434 */
2435 
2436 /**********************************************************
2437  *
2438  *   void CloseFiles(pp):
2439  *
2440  *                                              3-4-93
2441  *
2442  **********************************************************/
CloseFiles(ParserPtr pp)2443 void CloseFiles(ParserPtr pp)
2444 {
2445     if(pp->qsfd != NULL)
2446     {
2447         fclose(pp->qsfd);
2448         pp->qsfd = NULL;
2449     }
2450 }
2451 
2452 /**********************************************************
2453  *
2454  *   void MsgSkipTitleFail(flatfile, finfo):
2455  *
2456  *                                              7-2-93
2457  *
2458  **********************************************************/
MsgSkipTitleFail(const char * flatfile,FinfoBlkPtr finfo)2459 void MsgSkipTitleFail(const char *flatfile, FinfoBlkPtr finfo)
2460 {
2461     ErrPostEx(SEV_ERROR, ERR_ENTRY_Begin,
2462               "No valid beginning of entry found in %s file", flatfile);
2463 
2464     MemFree(finfo);
2465 }
2466 
2467 
2468 /**********************************************************/
FindNextEntryBuf(bool end_of_file,FileBuf & fbuf,FinfoBlkPtr finfo,const char * str,Int2 len)2469 bool FindNextEntryBuf(bool end_of_file, FileBuf& fbuf, FinfoBlkPtr finfo, const char *str, Int2 len)
2470 {
2471     bool done = end_of_file;
2472     while (!done && StringNCmp(finfo->str, str, len) != 0)
2473         done = XReadFileBuf(fbuf, finfo);
2474 
2475     return(done);
2476 }
2477 
2478 
FindNextEntryBuf(bool end_of_file,FileBuf & fbuf,FinfoBlkPtr finfo,const CTempString & keyword)2479 bool FindNextEntryBuf(bool end_of_file, FileBuf& fbuf, FinfoBlkPtr finfo, const CTempString& keyword)
2480 {
2481     return FindNextEntryBuf(end_of_file, fbuf, finfo, keyword.data(), keyword.size());
2482 }
2483 
2484 
2485 /**********************************************************
2486  *
2487  *   bool FlatFileIndex(pp, (*fun)()):
2488  *
2489  *                                              10-6-93
2490  *
2491  **********************************************************/
FlatFileIndex(ParserPtr pp,void (* fun)(IndexblkPtr entry,char * offset,Int4 len))2492 bool FlatFileIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
2493 {
2494     bool index;
2495 
2496     switch(pp->format)
2497     {
2498         case Parser::EFormat::GenBank:
2499             index = GenBankIndex(pp);
2500             break;
2501         case Parser::EFormat::EMBL:
2502             index = EmblIndex(pp, fun);
2503             break;
2504         case Parser::EFormat::SPROT:
2505             index = SprotIndex(pp, fun);
2506             break;
2507         case Parser::EFormat::PRF:
2508             index = PrfIndex(pp, fun);
2509             break;
2510         case Parser::EFormat::PIR:
2511             index = PirIndex(pp, fun);
2512             break;
2513         case Parser::EFormat::XML:
2514             index = XMLIndex(pp);
2515             break;
2516         default:
2517             index = false;
2518             fprintf(stderr, "Unknown flatfile format.\n");
2519             break;
2520     }
2521     return(index);
2522 }
2523 
2524 /**********************************************************/
GetAccArray(Parser::ESource source)2525 const char **GetAccArray(Parser::ESource source)
2526 {
2527     if(source == Parser::ESource::EMBL)
2528         return(embl_accpref);
2529     if(source == Parser::ESource::PIR)
2530         return(pir_accpref);
2531     if(source == Parser::ESource::PRF)
2532         return(prf_accpref);
2533     if(source == Parser::ESource::SPROT)
2534         return(sprot_accpref);
2535     if(source == Parser::ESource::LANL)
2536         return(lanl_accpref);
2537     if(source == Parser::ESource::DDBJ)
2538         return(ddbj_accpref);
2539     if(source == Parser::ESource::NCBI)
2540         return(ncbi_accpref);
2541     if(source == Parser::ESource::Refseq)
2542         return(refseq_accpref);
2543     return(NULL);
2544 }
2545 
2546 /**********************************************************/
GetNucAccOwner(const char * acc,bool is_tpa)2547 CSeq_id::E_Choice GetNucAccOwner(const char* acc, bool is_tpa)
2548 {
2549     Char    p[4];
2550     const char*q;
2551 
2552     if(acc == NULL)
2553         return objects::CSeq_id::e_not_set;
2554 
2555     size_t len = StringLen(acc);
2556 
2557     if(len > 8 && acc[2] == '_')
2558     {
2559         p[0] = acc[0];
2560         p[1] = acc[1];
2561         p[2] = acc[2];
2562         p[3] = '\0';
2563         if(MatchArrayString(refseq_accpref, p) > -1)
2564         {
2565             for(q = acc + 3; *q != '\0'; q++)
2566             {
2567                 if(*q >= '0' && *q <= '9')
2568                     continue;
2569                 break;
2570             }
2571             if(*q == '\0')
2572             {
2573                 return(objects::CSeq_id::e_Other);
2574             }
2575         }
2576     }
2577 
2578     if(len != 6 && (len < 8 || len > 17))
2579         return objects::CSeq_id::e_not_set;
2580 
2581     if(len == 11)
2582     {
2583         if(acc[0] == 'N' && acc[1] == 'Z' && acc[2] == '_' &&
2584            acc[3] >= 'A' && acc[3] <= 'Z' && acc[4] >= 'A' && acc[4] <= 'Z')
2585         {
2586             for(q = acc + 5; *q != '\0'; q++)
2587                 if(*q < '0' || *q > '9')
2588                     break;
2589             if(*q == '\0')
2590             {
2591                 return objects::CSeq_id::e_Other;
2592             }
2593         }
2594         return objects::CSeq_id::e_not_set;
2595     }
2596 
2597     if(len == 6)
2598     {
2599         if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < '0' || acc[1] > '9' ||
2600            acc[2] < '0' || acc[2] > '9' || acc[3] < '0' || acc[3] > '9' ||
2601            acc[4] < '0' || acc[4] > '9' || acc[5] < '0' || acc[5] > '9')
2602            return objects::CSeq_id::e_not_set;
2603 
2604         if(StringChr(ParFlat_NCBI_AC, acc[0]) != NULL)
2605         {
2606             return objects::CSeq_id::e_Genbank;
2607         }
2608         if(StringChr(ParFlat_LANL_AC, acc[0]) != NULL)
2609         {
2610             return objects::CSeq_id::e_Genbank;
2611         }
2612         if(StringChr(ParFlat_DDBJ_AC, acc[0]) != NULL)
2613         {
2614             return objects::CSeq_id::e_Ddbj;
2615         }
2616         if(StringChr(ParFlat_EMBL_AC, acc[0]) != NULL)
2617         {
2618             if (!is_tpa)
2619                 return objects::CSeq_id::e_Embl;
2620             return objects::CSeq_id::e_Tpe;
2621         }
2622         return objects::CSeq_id::e_not_set;
2623     }
2624 
2625     if(len > 11 && len < 16 && acc[2] != '_')
2626     {
2627         if(len == 12 && acc[0] == 'A' && acc[1] >= 'A' && acc[1] <= 'Z' &&
2628            acc[2] >= 'A' && acc[2] <= 'Z' && acc[3] >= 'A' &&
2629            acc[3] <= 'Z' && acc[4] >= 'A' && acc[4] <= 'Z' &&
2630            StringCmp(acc + 5, "0000000") == 0)
2631         {
2632             return(objects::CSeq_id::e_Ddbj);
2633         }
2634 
2635         if(((acc[0] < 'A' || acc[0] > 'S') &&
2636             (acc[0] < 'T' || acc[0] > 'W')) ||
2637            acc[1] < 'A' || acc[1] > 'Z' || acc[2] < 'A' || acc[2] > 'Z' ||
2638            acc[3] < 'A' || acc[3] > 'Z' || acc[4] < '0' || acc[4] > '9' ||
2639            acc[5] < '0' || acc[5] > '9' ||
2640            ((acc[6] < '0' || acc[6] > '9') && acc[6] != 'S') ||
2641            acc[7] < '0' || acc[7] > '9' ||
2642            acc[8] < '0' || acc[8] > '9' || acc[9] < '0' || acc[9] > '9' ||
2643            acc[10] < '0' || acc[10] > '9' || acc[11] < '0' || acc[11] > '9')
2644         {
2645             if(len != 15)
2646                 return objects::CSeq_id::e_not_set;
2647         }
2648 
2649         if(len == 12 && acc[6] == 'S')
2650             return objects::CSeq_id::e_not_set;
2651         if(len == 15 && acc[6] != 'S' && acc[5] >= '0' && acc[5] <= '9')
2652             return objects::CSeq_id::e_not_set;
2653 
2654         if(len > 12 && (acc[12] < '0' || acc[12] > '9'))
2655             return objects::CSeq_id::e_not_set;
2656         if (len > 13 && (acc[13] < '0' || acc[13] > '9'))
2657             return objects::CSeq_id::e_not_set;
2658         if (len > 14 && (acc[14] < '0' || acc[14] > '9'))
2659             return objects::CSeq_id::e_not_set;
2660 
2661         if(acc[0] == 'A' || acc[0] == 'D' || acc[0] == 'G' ||
2662            (acc[0] > 'I' && acc[0] < 'O') || (acc[0] > 'O' && acc[0] < 'T') ||
2663            acc[0] == 'V' || acc[0] == 'W')
2664         {
2665             if(acc[0] == 'D')
2666                 return objects::CSeq_id::e_Tpg;
2667             return objects::CSeq_id::e_Genbank;
2668         }
2669         if(acc[0] == 'B' || acc[0] == 'E' || acc[0] == 'I' || acc[0] == 'T')
2670         {
2671             if(acc[0] == 'E')
2672                 return objects::CSeq_id::e_Tpd;
2673             return objects::CSeq_id::e_Ddbj;
2674         }
2675         if(acc[0] == 'C' || acc[0] == 'F' || acc[0] == 'O' || acc[0] == 'H' ||
2676            acc[0] == 'U')
2677         {
2678             if (!is_tpa)
2679                 return objects::CSeq_id::e_Embl;
2680             return objects::CSeq_id::e_Tpe;
2681         }
2682         if(len != 15)
2683             return objects::CSeq_id::e_not_set;
2684     }
2685 
2686     if(len > 14 && len < 18)
2687     {
2688         if(acc[2] == '_')
2689         {
2690             if(acc[0] != 'N' || acc[1] != 'Z' || acc[2] != '_' ||
2691                ((acc[3] < 'A' || acc[3] > 'J') &&
2692                 (acc[3] < 'L' || acc[3] > 'N')) ||
2693                acc[4] < 'A' || acc[4] > 'Z' || acc[5] < 'A' || acc[5] > 'Z' ||
2694                acc[6] < 'A' || acc[6] > 'Z' || acc[7] < '0' || acc[7] > '9' ||
2695                acc[8] < '0' || acc[8] > '9' || acc[9] < '0' || acc[9] > '9' ||
2696                acc[10] < '0' || acc[10] > '9' || acc[11] < '0' ||
2697                acc[11] > '9' || acc[12] < '0' || acc[12] > '9' ||
2698                acc[13] < '0' || acc[13] > '9' || acc[14] < '0' || acc[14] > '9')
2699                return objects::CSeq_id::e_not_set;
2700 
2701             if(len > 15 && (acc[15] < '0' || acc[15] > '9'))
2702                 return objects::CSeq_id::e_not_set;
2703             if(len > 16 && (acc[16] < '0' || acc[16] > '9'))
2704                 return objects::CSeq_id::e_not_set;
2705             return objects::CSeq_id::e_Other;
2706         }
2707         if((acc[0] != 'A' && acc[0] != 'B' && acc[0] != 'C') ||
2708            acc[1] < 'A' || acc[1] > 'Z' || acc[2] < 'A' || acc[2] > 'Z' ||
2709            acc[3] < 'A' || acc[3] > 'Z' || acc[4] < 'A' || acc[4] > 'Z' ||
2710            acc[5] < 'A' || acc[5] > 'Z' || acc[6] < '0' || acc[6] > '9' ||
2711            acc[7] < '0' || acc[7] > '9' || acc[8] < '0' || acc[8] > '9' ||
2712            acc[9] < '0' || acc[9] > '9' || acc[10] < '0' || acc[10] > '9' ||
2713            acc[11] < '0' || acc[11] > '9' || acc[12] < '0' || acc[12] > '9' ||
2714            acc[13] < '0' || acc[13] > '9' || acc[14] < '0' || acc[14] > '9')
2715             return objects::CSeq_id::e_not_set;
2716 
2717         if(len > 15 && (acc[15] < '0' || acc[15] > '9'))
2718             return objects::CSeq_id::e_not_set;
2719         if(len > 16 && (acc[16] < '0' || acc[16] > '9'))
2720             return objects::CSeq_id::e_not_set;
2721 
2722         if(acc[0] == 'A')
2723         {
2724             return objects::CSeq_id::e_Genbank;
2725         }
2726         if(acc[0] == 'B')
2727         {
2728             return objects::CSeq_id::e_Ddbj;
2729         }
2730         if(acc[0] == 'C')
2731         {
2732             return objects::CSeq_id::e_Embl;
2733         }
2734         return objects::CSeq_id::e_not_set;
2735     }
2736 
2737     q = acc + ((len == 8 || len == 10) ? 2 : 3);
2738     if(q[0] < '0' || q[0] > '9' || q[1] < '0' || q[1] > '9' ||
2739        q[2] < '0' || q[2] > '9' || q[3] < '0' || q[3] > '9' ||
2740        q[4] < '0' || q[4] > '9' || q[5] < '0' || q[5] > '9')
2741        return objects::CSeq_id::e_not_set;
2742 
2743     if(len == 9)
2744     {
2745         p[0] = acc[0];
2746         p[1] = acc[1];
2747         p[2] = acc[2];
2748         p[3] = '\0';
2749         if(MatchArrayString(refseq_accpref, p) > -1)
2750         {
2751             return objects::CSeq_id::e_Other;
2752         }
2753         return objects::CSeq_id::e_not_set;
2754     }
2755 
2756     if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < 'A' || acc[1] > 'Z')
2757         return objects::CSeq_id::e_not_set;
2758 
2759     p[0] = acc[0];
2760     p[1] = acc[1];
2761     p[2] = '\0';
2762     if(MatchArrayString(ncbi_accpref, p) > -1)
2763     {
2764         if(MatchArrayString(ncbi_tpa_accpref, p) > -1)
2765             return objects::CSeq_id::e_Tpg;
2766         return objects::CSeq_id::e_Genbank;
2767     }
2768     if(MatchArrayString(lanl_accpref, p) > -1)
2769     {
2770         return objects::CSeq_id::e_Genbank;
2771     }
2772     if(MatchArrayString(ddbj_accpref, p) > -1)
2773     {
2774         if(MatchArrayString(ddbj_tpa_accpref, p) > -1)
2775             return objects::CSeq_id::e_Tpd;
2776         return objects::CSeq_id::e_Ddbj;
2777     }
2778     if(MatchArrayString(embl_accpref, p) > -1)
2779     {
2780         if (!is_tpa)
2781             return objects::CSeq_id::e_Embl;
2782         return objects::CSeq_id::e_Tpe;
2783     }
2784 
2785     return objects::CSeq_id::e_not_set;
2786 }
2787 
2788 
2789 
2790 /**********************************************************/
GetProtAccOwner(const Char * acc)2791 Uint1 GetProtAccOwner(const Char* acc)
2792 {
2793     const Char* q;
2794     Char    p[4];
2795 
2796     if(acc == NULL)
2797         return(0);
2798 
2799     size_t len = StringLen(acc);
2800     if(len == 9 || len == 12)
2801     {
2802         p[0] = acc[0];
2803         p[1] = acc[1];
2804         p[2] = acc[2];
2805         p[3] = '\0';
2806         if(MatchArrayString(refseq_prot_accpref, p) > -1)
2807         {
2808             for(q = &acc[3]; *q >= '0' && *q <= '9';)
2809                 q++;
2810             if(*q == '\0')
2811                 return objects::CSeq_id::e_Other;
2812         }
2813         return(0);
2814     }
2815 
2816     if(len != 8 && len != 10)
2817         return(0);
2818 
2819     if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < 'A' || acc[1] > 'Z' ||
2820        acc[2] < 'A' || acc[2] > 'Z' || acc[3] < '0' || acc[3] > '9' ||
2821        acc[4] < '0' || acc[4] > '9' || acc[5] < '0' || acc[5] > '9' ||
2822        acc[6] < '0' || acc[6] > '9' || acc[7] < '0' || acc[7] > '9')
2823     {
2824         if(len == 8)
2825             return(0);
2826         if(acc[8] < '0' || acc[8] > '9' || acc[9] < '0' || acc[9] > '9')
2827             return(0);
2828     }
2829 
2830     if(acc[0] == 'D' || acc[0] == 'H')
2831         return objects::CSeq_id::e_Tpg;
2832     if(acc[0] == 'F' || acc[0] == 'I')
2833         return objects::CSeq_id::e_Tpd;
2834     if(acc[0] == 'A' || acc[0] == 'E' || acc[0] == 'J' || acc[0] == 'K' ||
2835        (acc[0] > 'L' && acc[0] < 'S') || acc[0] == 'T' || acc[0] == 'U')
2836        return objects::CSeq_id::e_Genbank;
2837     if(acc[0] == 'B' || acc[0] == 'G' || acc[0] == 'L')
2838         return objects::CSeq_id::e_Ddbj;
2839     if(acc[0] == 'C' || acc[0] == 'S' || acc[0] == 'V')
2840         return objects::CSeq_id::e_Embl;
2841 
2842     return(0);
2843 }
2844 
2845 END_NCBI_SCOPE
2846