1 /* indx_blk.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: indx_blk.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Common for all format functions.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39
40 #include "index.h"
41 #include <objtools/flatfile/flatfile_parse_info.hpp>
42
43 #include "ftaerr.hpp"
44 #include "indx_blk.h"
45 #include "indx_def.h"
46 #include "utilfun.h"
47 #include <map>
48
49 #ifdef THIS_FILE
50 # undef THIS_FILE
51 #endif
52 #define THIS_FILE "indx_blk.cpp"
53
54
55 BEGIN_NCBI_SCOPE
56 USING_SCOPE(objects);
/* Strand keywords matched case-insensitively by XMLCheckSTRAND().
 * Like the other keyword tables here, the list is NULL-terminated. */
static const char *XML_STRAND_array[] = {
    " ", "single", "double", "mixed", NULL
};

/* Topology keywords matched case-insensitively by XMLCheckTPG(), which
 * turns a match on the blank entry (index 0) into index 1. */
static const char *XML_TPG_array[] = {
    " ", "Linear", "Circular", "Tandem", NULL
};

/* Strand prefixes looked up by CheckSTRAND(). */
static const char *ParFlat_STRAND_array[] = {
    " ", "ss-", "ds-", "ms-", NULL
};

/* Topology keywords matched case-insensitively by CheckTPG(). */
static const char *ParFlat_TPG_array[] = {
    " ", "Linear ", "Circular ", "Tandem ", NULL
};

/* Extra molecule type looked up by CheckNADDBJ(); CkLocusLinePos() accepts
 * it only when the source is DDBJ. */
static const char *ParFlat_NA_array_DDBJ[] = {
    "cDNA", NULL
};

/* Molecule keyword CkLocusLinePos() falls back to when ParFlat_NA_array
 * does not match. */
static const char *ParFlat_AA_array_DDBJ[] = {
    "PRT", NULL
};

/* Molecule types looked up by CheckNA() and CkLocusLinePos(). */
static const char *ParFlat_NA_array[] = {
    " ", "NA", "DNA", "genomic DNA", "other DNA", "unassigned DNA", "RNA",
    "mRNA", "rRNA", "tRNA", "uRNA", "scRNA", "snRNA", "snoRNA", "pre-RNA",
    "pre-mRNA", "genomic RNA", "other RNA", "unassigned RNA", "cRNA",
    "viral cRNA", NULL
};

/* Division codes looked up by CheckDIV(). */
static const char *ParFlat_DIV_array[] = {
    " ", "PRI", "ROD", "MAM", "VRT", "INV", "PLN", "BCT", "RNA",
    "VRL", "PHG", "SYN", "UNA", "EST", "PAT", "STS", "ORG", "GSS",
    "HUM", "HTG", "CON", "HTC", "ENV", "TSA", NULL
};
93
/*
 * NULL-terminated accession prefix tables.
 * NOTE(review): none of these tables is referenced in the part of the file
 * reviewed here; judging by the names they list the accession prefixes
 * belonging to each database / record class - confirm against the code
 * that consumes them.
 */

/* Prefixes listed for EMBL. */
static const char *embl_accpref[] = {
    "AJ", "AL", "AM", "AN", "AX", "BN", "BX", "CQ", "CR", "CS", "CT", "CU",
    "FB", "FM", "FN", "FO", "FP", "FQ", "FR", "GM", "GN", "HA", "HB", "HC",
    "HD", "HE", "HF", "HG", "HH", "HI", "JA", "JB", "JC", "JD", "JE", "LK",
    "LL", "LM", "LN", "LO", "LP", "LQ", "LR", "LS", "LT", "MP", "MQ", "MR",
    "MS", "OA", "OB", "OC", "OD", "OE", NULL
};

/* Prefixes listed for LANL. */
static const char *lanl_accpref[] = {
    "AD", NULL
};

/* Prefixes listed for PIR. */
static const char *pir_accpref[] = {
    "CC", NULL
};

/* Prefixes listed for PRF. */
static const char *prf_accpref[] = {
    "XX", NULL
};

/* Prefixes listed for Swiss-Prot. */
static const char *sprot_accpref[] = {
    "DD", NULL
};

/* Prefixes listed for DDBJ. */
static const char *ddbj_accpref[] = {
    "AB", "AG", "AK", "AP", "AT", "AU", "AV", "BA", "BB", "BD", "BJ", "BP",
    "BR", "BS", "BW", "BY", "CI", "CJ", "DA", "DB", "DC", "DD", "DE", "DF",
    "DG", "DH", "DI", "DJ", "DK", "DL", "DM", "FS", "FT", "FU", "FV", "FW",
    "FX", "FY", "FZ", "GA", "GB", "HT", "HU", "HV", "HW", "HX", "HY", "HZ",
    "LA", "LB", "LC", "LD", "LE", "LF", "LG", "LH", "LI", "LJ", "LU", "LV",
    "LX", "LY", "LZ", "MA", "MB", "MC", "MD", "ME", "OF", "OG", NULL
};

/* Prefixes listed for NCBI. */
static const char *ncbi_accpref[] = {
    "AA", "AC", "AD", "AE", "AF", "AH", "AI", "AQ", "AR", "AS", "AW", "AY",
    "AZ", "BC", "BE", "BF", "BG", "BH", "BI", "BK", "BL", "BM", "BQ", "BT",
    "BU", "BV", "BZ", "CA", "CB", "CC", "CD", "CE", "CF", "CG", "CH", "CK",
    "CL", "CM", "CN", "CO", "CP", "CV", "CW", "CX", "CY", "CZ", "DN", "DP",
    "DQ", "DR", "DS", "DT", "DU", "DV", "DW", "DX", "DY", "DZ", "EA", "EB",
    "EC", "ED", "EE", "EF", "EG", "EH", "EI", "EJ", "EK", "EL", "EM", "EN",
    "EP", "EQ", "ER", "ES", "ET", "EU", "EV", "EW", "EX", "EY", "EZ", "FA",
    "FC", "FD", "FE", "FF", "FG", "FH", "FI", "FJ", "FK", "FL", "GC", "GD",
    "GE", "GF", "GG", "GH", "GJ", "GK", "GL", "GO", "GP", "GQ", "GR", "GS",
    "GT", "GU", "GV", "GW", "GX", "GY", "GZ", "HJ", "HK", "HL", "HM", "HN",
    "HO", "HP", "HQ", "HR", "HS", "JF", "JG", "JH", "JI", "JJ", "JK", "JL",
    "JM", "JN", "JO", "JP", "JQ", "JR", "JS", "JT", "JU", "JV", "JW", "JX",
    "JY", "JZ", "KA", "KB", "KC", "KD", "KE", "KF", "KG", "KH", "KI", "KJ",
    "KK", "KL", "KM", "KN", "KO", "KP", "KQ", "KR", "KS", "KT", "KU", "KV",
    "KX", "KY", "KZ", "MF", "MG", "MH", "MI", "MJ", "MK", "ML", "MM", "MN",
    "MO", "MT", "MU", NULL
};

/* Three-character prefixes (two letters plus '_') listed for RefSeq. */
static const char *refseq_accpref[] = {
    "NC_", "NG_", "NM_", "NP_", "NR_", "NT_", "NW_", "XM_", "XP_", "XR_",
    "NZ_", NULL
};

/* Prefixes listed for RefSeq protein accessions. */
static const char *refseq_prot_accpref[] = {
    "AP_", "NP_", "WP_", "XP_", "YP_", "ZP_", NULL
};

/* Prefixes listed as "TSA allowed". */
static const char *acc_tsa_allowed[] = {
    "AF", "AY", "DQ", "EF", "EU", "FJ", "GQ", "HQ", "JF", "JN", "JQ", "JX",
    "KC", "KF", "KJ", "KM", "KP", "KR", "KT", "KU", "KX", "KY", "MF", "MG",
    "MH", "MK", "MN", "MT", NULL
};

/* Prefixes listed for NCBI TPA. */
static const char *ncbi_tpa_accpref[] = {
    "BK", "BL", "GJ", "GK", NULL
};

/* Prefixes listed for DDBJ TPA. */
static const char *ddbj_tpa_accpref[] = {
    "BR", "HT", "HU", NULL
};

/* Prefixes listed for NCBI WGS. */
static const char *ncbi_wgs_accpref[] = {
    "GJ", "GK", NULL
};

/* Prefixes listed for DDBJ WGS. */
static const char *ddbj_wgs_accpref[] = {
    "HT", "HU", NULL
};
176
/* Set of two-letter prefixes.
 * NOTE(review): by its name this holds the accession prefixes of WGS
 * scaffolds; the lookup site is not part of the code reviewed here. */
static const set<string> k_WgsScaffoldPrefix =
    {"CH", "CT", "CU", "DF", "DG", "DS",
     "EM", "EN", "EP", "EQ", "FA", "FM",
     "GG", "GJ", "GK", "GL", "HT", "HU",
     "JH", "KB", "KD", "KE", "KI", "KK",
     "KL", "KN", "KQ", "KV", "KZ", "LD",
     "ML", "MU"};
184
185 //static const char *wgs_scfld_pref[] =
186
/* Display names of the sequence sources, addressed by position (11 slots,
 * first and last both "unknown").
 * NOTE(review): the sourceNames map defined right after this array carries
 * almost the same names keyed by Parser::ESource; check whether this array
 * is still referenced anywhere. */
static const char *source[11] = {
    "unknown",
    "EMBL",
    "GENBANK",
    "PIR",
    "Swiss-Prot",
    "NCBI",
    "GSDB",
    "DDBJ",
    "FlyBase",
    "RefSeq",
    "unknown"
};
200
201
202 static const map<Parser::ESource, string> sourceNames = {
203 {Parser::ESource::unknown, "unknown"},
204 {Parser::ESource::EMBL, "EMBL"},
205 {Parser::ESource::GenBank, "GENBANK"},
206 {Parser::ESource::PIR, "PIR"},
207 {Parser::ESource::SPROT, "Swiss-Prot"},
208 {Parser::ESource::NCBI, "NCBI"},
209 {Parser::ESource::LANL, "GSDB"},
210 {Parser::ESource::Flybase, "FlyBase"},
211 {Parser::ESource::Refseq, "RefSeq"},
212 {Parser::ESource::PRF, "unknown"}};
213
/* Month abbreviations searched for by CkDateFormat() through
 * MatchArraySubString().
 * NOTE(review): the "Ill" entry at index 0 looks like a placeholder that
 * keeps JAN..DEC at indices 1..12 - confirm. */
static const char *month_name[] = {
    "Ill", "JAN", "FEB", "MAR", "APR", "MAY", "JUN",
    "JUL", "AUG", "SEP", "OCT", "NOV", "DEC", NULL
};

/* Unit tokens GetResidue() looks for (compared ignoring case); the token
 * in front of one of these is returned as the sequence length string. */
static const char *ParFlat_RESIDUE_STR[] = {
    "bp", "bp.", "bp,", "AA", "AA.", "AA,", NULL
};

/* Molecule types fta_check_embl_moltype() accepts in an EMBL ID line;
 * the comparison there is case-sensitive (StringCmp). */
static const char *ValidMolTypes[] = {
    "genomic DNA",
    "genomic RNA",
    "mRNA",
    "tRNA",
    "rRNA",
    "snoRNA",
    "snRNA",
    "scRNA",
    "pre-RNA",
    "pre-mRNA",
    "other RNA",
    "other DNA",
    "transcribed RNA",
    "unassigned RNA",
    "unassigned DNA",
    "viral cRNA",
    NULL
};
242
// functions below are implemented in different source files
// Each takes the parser state; four of them additionally take a callback
// receiving (entry, offset, len).
// NOTE(review): judging by the names there is one indexing routine per
// input format - confirm in the implementing files.
bool EmblIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len));
bool GenBankIndex(ParserPtr pp);
bool SprotIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char* offset, Int4 len));
bool PrfIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char* offset, Int4 len));
bool PirIndex(ParserPtr pp, void(*fun)(IndexblkPtr entry, char* offset, Int4 len));
bool XMLIndex(ParserPtr pp);
250
251 /**********************************************************
252 *
253 * static char* GetResidue(stoken):
254 *
255 * Return a string pointer in the "stoken" which its
256 * next token string match any one string in the
257 * ParFlat_RESIDUE_STR but ignore case for all alphabetic
258 * characters; return NULL if not found.
259 *
260 * 3-25-93
261 *
262 **********************************************************/
GetResidue(TokenStatBlkPtr stoken)263 static char* GetResidue(TokenStatBlkPtr stoken)
264 {
265 TokenBlkPtr sptr;
266 TokenBlkPtr ptr;
267 const char **b;
268 Int2 i;
269
270 ptr = stoken->list;
271 sptr = stoken->list->next;
272 for(i = 1; i < stoken->num; i++, ptr = ptr->next, sptr = sptr->next)
273 {
274 for(b = ParFlat_RESIDUE_STR; *b != NULL; b++)
275 if(StringICmp(*b, sptr->str) == 0)
276 return(ptr->str);
277 }
278
279 return(NULL);
280 }
281
282 /**********************************************************
283 *
284 * bool XReadFile(fp, finfo):
285 *
286 * Record position and line # of the file, loop stop
287 * when got a none blank line.
288 * Return TRUE if END_OF_FILE.
289 *
290 * 2-26-93
291 *
292 **********************************************************/
XReadFile(FILE * fp,FinfoBlkPtr finfo)293 bool XReadFile(FILE* fp, FinfoBlkPtr finfo)
294 {
295 bool end_of_file = false;
296
297 StringCpy(finfo->str, "\n");
298 while(!end_of_file && StringNCmp(finfo->str, "\n", 1) == 0)
299 {
300 finfo->pos = (size_t) ftell(fp);
301 if (fgets(finfo->str, sizeof(finfo->str) - 1, fp) == NULL)
302 end_of_file = true;
303 else
304 ++(finfo->line);
305 }
306
307 auto n = strlen(finfo->str);
308 while (n) {
309 n--;
310 if (finfo->str[n] != '\n' && finfo->str[n] != '\r') {
311 break;
312 }
313 finfo->str[n] = 0;
314 }
315
316 return(end_of_file);
317 }
318
319 /**********************************************************/
FileGetsBuf(char * res,Int4 size,FileBuf & fbuf)320 static Int2 FileGetsBuf(char* res, Int4 size, FileBuf& fbuf)
321 {
322 const char* p = nullptr;
323 char* q;
324 Int4 l;
325 Int4 i;
326
327 if(*fbuf.current == '\0')
328 return(0);
329
330 l = size - 1;
331 for(p = fbuf.current, q = res, i = 0; i < l; i++, p++)
332 {
333 *q++ = *p;
334 if(*p == '\n' || *p == '\r')
335 {
336 p++;
337 break;
338 }
339 }
340
341 *q = '\0';
342 fbuf.current = p;
343 return(1);
344 }
345
346 /**********************************************************/
XReadFileBuf(FileBuf & fbuf,FinfoBlkPtr finfo)347 bool XReadFileBuf(FileBuf& fbuf, FinfoBlkPtr finfo)
348 {
349 bool end_of_file = false;
350
351 StringCpy(finfo->str, "\n");
352 while(!end_of_file && StringNCmp(finfo->str, "\n", 1) == 0)
353 {
354 finfo->pos = (size_t) (fbuf.current - fbuf.start);
355 if(FileGetsBuf(finfo->str, sizeof(finfo->str) - 1, fbuf) == 0)
356 end_of_file = true;
357 else
358 ++(finfo->line);
359 }
360
361 return(end_of_file);
362 }
363
364 /**********************************************************
365 *
366 * bool SkipTitle(fp, finfo, str, len):
367 *
368 * Return TRUE if file contains no entry in which no
369 * match in keyword "str".
370 * Skip any title declaration lines.
371 *
372 * 3-5-93
373 *
374 **********************************************************/
SkipTitle(FILE * fp,FinfoBlkPtr finfo,const char * str,Int2 len)375 bool SkipTitle(FILE* fp, FinfoBlkPtr finfo, const char *str, Int2 len)
376 {
377 bool end_of_file = XReadFile(fp, finfo);
378 while(!end_of_file && StringNCmp(finfo->str, str, len) != 0)
379 end_of_file = XReadFile(fp, finfo);
380
381 return(end_of_file);
382 }
383
384
SkipTitle(FILE * fp,FinfoBlkPtr finfo,const CTempString & keyword)385 bool SkipTitle(FILE* fp, FinfoBlkPtr finfo, const CTempString& keyword)
386 {
387 return SkipTitle(fp, finfo, keyword.data(), keyword.size());
388 }
389
390 /**********************************************************/
SkipTitleBuf(FileBuf & fbuf,FinfoBlkPtr finfo,const char * str,Int2 len)391 bool SkipTitleBuf(FileBuf& fbuf, FinfoBlkPtr finfo, const char *str, Int2 len)
392 {
393 bool end_of_file = XReadFileBuf(fbuf, finfo);
394 while(!end_of_file && StringNCmp(finfo->str, str, len) != 0)
395 end_of_file = XReadFileBuf(fbuf, finfo);
396
397 return(end_of_file);
398 }
399
400
SkipTitleBuf(FileBuf & fbuf,FinfoBlkPtr finfo,const CTempString & keyword)401 bool SkipTitleBuf(FileBuf& fbuf, FinfoBlkPtr finfo, const CTempString& keyword)
402 {
403 return SkipTitleBuf(fbuf, finfo, keyword.data(), keyword.size());
404 }
405
406
407 /**********************************************************
408 *
409 * static bool CheckLocus(locus):
410 *
411 * Locus name only allow A-Z, 0-9, characters,
412 * reject if not.
413 *
414 **********************************************************/
CheckLocus(char * locus,Parser::ESource source)415 static bool CheckLocus(char* locus, Parser::ESource source)
416 {
417 char* p = locus;
418 if(StringNCmp(locus, "SEG_", 4) == 0 &&
419 (source == Parser::ESource::NCBI || source == Parser::ESource::DDBJ))
420 p += 4;
421 for(; *p != '\0'; p++)
422 {
423 if((*p >= '0' && *p <= '9') || (*p >= 'A' && *p <= 'Z') ||
424 (*p == '.' && source == Parser::ESource::Flybase))
425 continue;
426 if(((*p >= 'a' && *p <= 'z') || *p == '_' || *p == '-' || *p == '(' ||
427 *p == ')' || *p == '/') && source == Parser::ESource::Refseq)
428 continue;
429
430 ErrPostEx(SEV_ERROR, ERR_LOCUS_BadLocusName,
431 "Bad locusname, <%s> for this entry", locus);
432 break;
433 }
434
435 return (*p != '\0');
436 }
437
438 /**********************************************************
439 *
440 * static bool CheckLocusSP(locus):
441 *
442 * Locus name consists of up tp 10 uppercase
443 * alphanumeric characters.
444 * Rule: X_Y format (SWISS-PROT), reject if not
445 * - X is a mnemonic code, up to 4 alphanumeric
446 * characters to represent the protein name.
447 * - Y is a mnemonic species identification code of
448 * at most 5 alphanumeric characters to representing
449 * the biological source of the protein.
450 * Checking the defined species identification code
451 * has not been implemented.
452 *
453 * Example: RL1_ECOLI FER_HALHA
454 *
455 **********************************************************/
CheckLocusSP(char * locus)456 static bool CheckLocusSP(char* locus)
457 {
458 char* p;
459 bool underscore = false;
460 Int2 x;
461 Int2 y;
462
463 for(p = locus, x = y = 0; *p != '\0'; p++)
464 {
465 if((*p >= '0' && *p <= '9') || (*p >= 'A' && *p <= 'Z'))
466 {
467 if (!underscore)
468 x++;
469 else
470 y++;
471 }
472 else if(*p == '_')
473 underscore = true;
474 else
475 break;
476 }
477
478 if(*p != '\0' || x == 0 || y == 0)
479 {
480 ErrPostEx(SEV_ERROR, ERR_LOCUS_BadLocusName,
481 "Bad locusname, <%s> for this entry", locus);
482 return true;
483 }
484
485 return false;
486 }
487
488 /**********************************************************
489 *
490 * static bool CkDateFormat(date):
491 *
492 * Return FALSE if date != dd-mmm-yyyy format.
493 *
494 **********************************************************/
CkDateFormat(char * date)495 static bool CkDateFormat(char* date)
496 {
497 if(date[2] == '-' && date[6] == '-' &&
498 IS_DIGIT(date[0]) != 0 && IS_DIGIT(date[1]) != 0 &&
499 IS_DIGIT(date[7]) != 0 && IS_DIGIT(date[8]) != 0 &&
500 IS_DIGIT(date[9]) != 0 && IS_DIGIT(date[10]) != 0 &&
501 MatchArraySubString(month_name, date) != -1)
502 return true;
503
504 return false;
505 }
506
507 /**********************************************************/
CheckSTRAND(const char * str)508 Int2 CheckSTRAND(const char* str)
509 {
510 return(fta_StringMatch(ParFlat_STRAND_array, str));
511 }
512
513 /**********************************************************/
XMLCheckSTRAND(char * str)514 Int2 XMLCheckSTRAND(char* str)
515 {
516 return(StringMatchIcase(XML_STRAND_array, str));
517 }
518
519 /**********************************************************/
XMLCheckTPG(char * str)520 Int2 XMLCheckTPG(char* str)
521 {
522 Int2 i;
523
524 i = StringMatchIcase(XML_TPG_array, str);
525 if(i == 0)
526 i++;
527 return(i);
528 }
529
530 /**********************************************************/
CheckTPG(char * str)531 Int2 CheckTPG(char* str)
532 {
533 return(StringMatchIcase(ParFlat_TPG_array, str));
534 }
535
536 /**********************************************************/
CheckNADDBJ(char * str)537 Int2 CheckNADDBJ(char* str)
538 {
539 return(fta_StringMatch(ParFlat_NA_array_DDBJ, str));
540 }
541
542 /**********************************************************/
CheckNA(char * str)543 Int2 CheckNA(char* str)
544 {
545 return(fta_StringMatch(ParFlat_NA_array, str));
546 }
547
548 /**********************************************************/
CheckDIV(char * str)549 Int2 CheckDIV(char* str)
550 {
551 return(fta_StringMatch(ParFlat_DIV_array, str));
552 }
553
554 /**********************************************************/
CkLocusLinePos(char * offset,Parser::ESource source,LocusContPtr lcp,bool is_mga)555 bool CkLocusLinePos(char* offset, Parser::ESource source, LocusContPtr lcp, bool is_mga)
556 {
557 Char date[12];
558 bool ret = true;
559 char* p;
560 Int4 i;
561
562 p = StringChr(offset, '\n');
563 if(p != NULL)
564 *p = '\0';
565
566 if(is_mga == false && StringNCmp(offset + lcp->bp, "bp", 2) != 0 &&
567 StringNCmp(offset + lcp->bp, "rc", 2) != 0 &&
568 StringNCmp(offset + lcp->bp, "aa", 2) != 0)
569 {
570 i = lcp->bp + 1;
571 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
572 "bp/rc string unrecognized in column %d-%d: %s",
573 i, i + 1, offset + lcp->bp);
574 ret = false;
575 }
576 if(CheckSTRAND(offset + lcp->strand) == -1)
577 {
578 i = lcp->strand + 1;
579 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
580 "Strand unrecognized in column %d-%d : %s",
581 i, i + 2, offset + lcp->strand);
582 }
583
584 p = offset + lcp->molecule;
585 if(is_mga)
586 {
587 if(StringNICmp(p, "mRNA", 4) != 0 && StringNCmp(p, "RNA", 3) != 0)
588 {
589 ErrPostEx(SEV_REJECT, ERR_FORMAT_IllegalCAGEMoltype,
590 "Illegal molecule type provided in CAGE record in LOCUS line: \"%s\". Must be \"mRNA\"or \"RNA\". Entry dropped.",
591 p);
592 ret = false;
593 }
594 }
595 else if(StringMatchIcase(ParFlat_NA_array, p) == -1)
596 {
597 if(StringMatchIcase(ParFlat_AA_array_DDBJ, p) == -1)
598 {
599 i = lcp->molecule + 1;
600 if(source != Parser::ESource::DDBJ ||
601 StringMatchIcase(ParFlat_NA_array_DDBJ, p) == -1)
602 {
603 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
604 "Molecule unrecognized in column %d-%d: %s",
605 i, i + 5, p);
606 ret = false;
607 }
608 }
609 }
610
611 if(CheckTPG(offset + lcp->topology) == -1)
612 {
613 i = lcp->topology + 1;
614 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
615 "Topology unrecognized in column %d-%d: %s",
616 i, i + 7, offset + lcp->topology);
617 ret = false;
618 }
619 if(CheckDIV(offset + lcp->div) == -1)
620 {
621 i = lcp->div + 1;
622 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
623 "Division code unrecognized in column %d-%d: %s",
624 i, i + 2, offset + lcp->div);
625 ret = (source == Parser::ESource::LANL);
626 }
627 MemCpy(date, offset + lcp->date, 11);
628 date[11] = '\0';
629 if(StringNCmp(date, "NODATE", 6) == 0)
630 {
631 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
632 "NODATE in LOCUS line will be replaced by current system date");
633 }
634 else if(!CkDateFormat(date))
635 {
636 i = lcp->date + 1;
637 ErrPostEx(SEV_WARNING, ERR_FORMAT_LocusLinePosition,
638 "Date should be in column %d-%d, and format dd-mmm-yyyy: %s",
639 i, i + 10, date);
640 ret = false;
641 }
642
643 if(p != NULL)
644 *p = '\n';
645 return(ret);
646 }
647
648 /**********************************************************
649 *
650 * CRef<objects::CDate_std> GetUpdateDate(ptr, source):
651 *
652 * Return NULL if ptr does not have dd-mmm-yyyy format
653 * or "NODATE"; otherwise, return Date-std pointer.
654 *
655 **********************************************************/
GetUpdateDate(char * ptr,Parser::ESource source)656 CRef<objects::CDate_std> GetUpdateDate(char* ptr, Parser::ESource source)
657 {
658 Char date[12];
659
660 if (StringNCmp(ptr, "NODATE", 6) == 0)
661 return CRef<objects::CDate_std>(new objects::CDate_std(CTime(CTime::eCurrent)));
662
663 if (ptr[11] != '\0' && ptr[11] != '\n' && ptr[11] != ' ' &&
664 (source != Parser::ESource::SPROT || ptr[11] != ','))
665 return CRef<objects::CDate_std>();
666
667 MemCpy(date, ptr, 11);
668 date[11] = '\0';
669
670 if (!CkDateFormat(date))
671 return CRef<objects::CDate_std>();
672
673 return get_full_date(ptr, false, source);
674 }
675
676
677 /**********************************************************/
fta_check_embl_moltype(char * str)678 static bool fta_check_embl_moltype(char* str)
679 {
680 const char **b;
681 char* p;
682 char* q;
683
684 p = StringChr(str, ';');
685 p = StringChr(p + 1, ';');
686 p = StringChr(p + 1, ';');
687
688 for(p++; *p == ' ';)
689 p++;
690
691 q = StringChr(p, ';');
692 *q = '\0';
693
694 for(b = ValidMolTypes; *b != NULL; b++)
695 if(StringCmp(p, *b) == 0)
696 break;
697
698 if(*b != NULL)
699 {
700 *q = ';';
701 return true;
702 }
703
704 ErrPostEx(SEV_REJECT, ERR_FORMAT_InvalidIDlineMolType,
705 "Invalid moltype value \"%s\" provided in ID line of EMBL record.",
706 p);
707 *q = ';';
708 return false;
709 }
710
711 /*********************************************************
712 indexblk_struct constructor
713 **********************************************************/
indexblk_struct()714 indexblk_struct::indexblk_struct() :
715 vernum(0),
716 offset(0),
717 bases(0),
718 segnum(0),
719 segtotal(0),
720 linenum(0),
721 drop(0),
722 len(0),
723 EST(false),
724 STS(false),
725 GSS(false),
726 HTC(false),
727 htg(0),
728 is_contig(false),
729 is_mga(false),
730 origin(false),
731 is_pat(false),
732 is_wgs(false),
733 is_tpa(false),
734 is_tsa(false),
735 is_tls(false),
736 is_tpa_wgs_con(false),
737 tsa_allowed(false),
738 moltype(NULL),
739 gaps(NULL),
740 secaccs(NULL),
741 xip(NULL),
742 embl_new_ID(false),
743 env_sample_qual(false),
744 is_prot(false),
745 organism(NULL),
746 taxid(0),
747 no_gc_warning(false),
748 qsoffset(0),
749 qslength(0),
750 wgs_and_gi(0),
751 got_plastid(false),
752 gc_genomic(0),
753 gc_mito(0),
754 specialist_db(false),
755 inferential(false),
756 experimental(false),
757 submitter_seqid(NULL),
758 ppp(NULL)
759 {
760 acnum[0] = 0;
761 locusname[0] = 0;
762 division[0] = 0;
763 blocusname[0] = 0;
764
765 MemSet(&lc, 0, sizeof(lc));
766
767 wgssec[0] = 0;
768 }
769
/* Predicate for the find_if() scans of the LOCUS line: true if "c" is a
   whitespace character according to isspace().
   The character is converted to unsigned char first: handing isspace() a
   negative value other than EOF (any byte >= 0x80 where char is signed)
   is undefined behavior. */
static bool isSpace(char c)
{
    return isspace((unsigned char) c) != 0;
}
774
775
776 static CTempString::const_iterator
sFindNextSpace(const CTempString & tempString,CTempString::const_iterator current_it)777 sFindNextSpace(const CTempString& tempString,
778 CTempString::const_iterator current_it)
779 {
780 return find_if(current_it, tempString.end(), isSpace);
781 }
782
783
784 static CTempString::const_iterator
sFindNextNonSpace(const CTempString & tempString,CTempString::const_iterator current_it)785 sFindNextNonSpace(const CTempString& tempString,
786 CTempString::const_iterator current_it)
787 {
788 return find_if_not(current_it, tempString.end(), isSpace);
789 }
790
791
sSetLocusLineOffsets(const CTempString & locusLine,LocusCont & offsets)792 static void sSetLocusLineOffsets(const CTempString& locusLine, LocusCont& offsets)
793 {
794 offsets.bases = -1;
795 offsets.bp = -1;
796 offsets.strand = -1;
797 offsets.molecule = -1;
798 offsets.topology = -1;
799 offsets.div = -1;
800 offsets.date = -1;
801
802 if (locusLine.substr(0,5) != "LOCUS") {
803 // throw an exception - invalid locus line
804 }
805
806
807 auto it = sFindNextNonSpace(locusLine, locusLine.begin()+5);
808 if (it == locusLine.end()) {
809 // throw an exception - no locus name
810 }
811
812 it = sFindNextSpace(locusLine, it);
813 if (it == locusLine.end()) {
814 return;
815 }
816
817 // find the number of bases
818 it = sFindNextNonSpace(locusLine, it);
819 if (it == locusLine.end()) {
820 return;
821 }
822 auto space_it = sFindNextSpace(locusLine, it);
823 if (NStr::StringToNonNegativeInt(locusLine.substr(it-begin(locusLine), space_it-it)) == -1) {
824 return;
825 }
826
827 offsets.bases = it - begin(locusLine);
828
829 it = sFindNextNonSpace(locusLine, space_it);
830 offsets.bp = it - begin(locusLine);
831
832 it = sFindNextSpace(locusLine, it);
833 it = sFindNextNonSpace(locusLine, it);
834 // the next one might be a strand
835 // or might be a molecule
836 space_it = sFindNextSpace(locusLine, it);
837 offsets.strand = -1;
838 if ((space_it - it)==3) {
839 auto currentSubstr = locusLine.substr(it-begin(locusLine),3);
840 if (currentSubstr=="ss-" ||
841 currentSubstr=="ds-" ||
842 currentSubstr=="ms-") {
843 offsets.strand = it - begin(locusLine);
844 it = sFindNextNonSpace(locusLine, space_it);
845 }
846 offsets.molecule = it - begin(locusLine);
847 }
848 else {
849 offsets.molecule = it - begin(locusLine);
850 }
851
852 // topology
853 it = sFindNextSpace(locusLine, it);
854 it = sFindNextNonSpace(locusLine, it);
855 if (it != locusLine.end()) {
856 offsets.topology = it - begin(locusLine);
857 }
858
859 // find division
860 it = sFindNextSpace(locusLine, it);
861 it = sFindNextNonSpace(locusLine, it);
862 if (it != locusLine.end()) {
863 offsets.div = it - begin(locusLine);
864 }
865
866 // find date - date is optional
867 it = sFindNextSpace(locusLine, it);
868 it = sFindNextNonSpace(locusLine, it);
869 if (it != locusLine.end()) {
870 offsets.date = it - begin(locusLine);
871 }
872 }
873
874 /**********************************************************
875 *
876 * IndexblkPtr InitialEntry(pp, finfo):
877 *
878 * Assign the entry's value to offset, locusname,
879 * bases, linenum, drop blocusname.
880 * Swiss-prot locusname checking is different from
881 * others.
882 * Check LOCUS line column position, genbank format.
883 *
884 **********************************************************/
InitialEntry(ParserPtr pp,FinfoBlkPtr finfo)885 IndexblkPtr InitialEntry(ParserPtr pp, FinfoBlkPtr finfo)
886 {
887 Int2 i;
888 Int2 j;
889 TokenStatBlkPtr stoken;
890 TokenBlkPtr ptr;
891 char* bases;
892 IndexblkPtr entry;
893 char* p;
894
895 entry = new Indexblk;
896
897 entry->offset = finfo->pos;
898 entry->linenum = finfo->line;
899 entry->ppp = pp;
900 entry->is_tsa = false;
901 entry->is_tls = false;
902 entry->is_pat = false;
903
904 if(pp->source == Parser::ESource::PRF)
905 stoken = TokenString(finfo->str, ';');
906 else
907 stoken = TokenString(finfo->str, ' ');
908
909 bool badlocus = false;
910 if(stoken->num > 2 || (pp->format == Parser::EFormat::PRF && stoken->num > 1))
911 {
912 p = finfo->str;
913 if (pp->mode == Parser::EMode::Relaxed) {
914 sSetLocusLineOffsets(p, entry->lc);
915 } else {
916 if(StringLen(p) > 78 && p[28] == ' ' && p[63] == ' ' && p[67] == ' ')
917 {
918 entry->lc.bases = ParFlat_COL_BASES_NEW;
919 entry->lc.bp = ParFlat_COL_BP_NEW;
920 entry->lc.strand = ParFlat_COL_STRAND_NEW;
921 entry->lc.molecule = ParFlat_COL_MOLECULE_NEW;
922 entry->lc.topology = ParFlat_COL_TOPOLOGY_NEW;
923 entry->lc.div = ParFlat_COL_DIV_NEW;
924 entry->lc.date = ParFlat_COL_DATE_NEW;
925 }
926 else
927 {
928 entry->lc.bases = ParFlat_COL_BASES;
929 entry->lc.bp = ParFlat_COL_BP;
930 entry->lc.strand = ParFlat_COL_STRAND;
931 entry->lc.molecule = ParFlat_COL_MOLECULE;
932 entry->lc.topology = ParFlat_COL_TOPOLOGY;
933 entry->lc.div = ParFlat_COL_DIV;
934 entry->lc.date = ParFlat_COL_DATE;
935 }
936 }
937
938 ptr = stoken->list->next;
939 if(pp->format == Parser::EFormat::EMBL && ptr->next != NULL &&
940 ptr->next->str != NULL && StringCmp(ptr->next->str, "SV") == 0)
941 {
942 for(i = 0, p = finfo->str; *p != '\0'; p++)
943 if(*p == ';' && p[1] == ' ')
944 i++;
945
946 entry->embl_new_ID = true;
947 p = StringRChr(ptr->str, ';');
948 if(p != NULL && p[1] == '\0')
949 *p = '\0';
950
951 FtaInstallPrefix(PREFIX_LOCUS, ptr->str, NULL);
952 FtaInstallPrefix(PREFIX_ACCESSION, ptr->str, NULL);
953
954 if(i != 6 || (stoken->num != 10 && stoken->num != 11))
955 {
956 ErrPostEx(SEV_REJECT, ERR_FORMAT_BadlyFormattedIDLine,
957 "The number of fields in this EMBL record's new ID line does not fit requirements.");
958 badlocus = true;
959 }
960 else if(fta_check_embl_moltype(finfo->str) == false)
961 badlocus = true;
962 }
963
964 StringCpy(entry->locusname, ptr->str);
965 StringCpy(entry->blocusname, entry->locusname);
966 if(pp->format == Parser::EFormat::PIR || pp->format == Parser::EFormat::PRF)
967 StringCpy(entry->acnum, entry->locusname);
968
969 if(entry->embl_new_ID == false)
970 {
971 FtaInstallPrefix(PREFIX_LOCUS, entry->locusname, NULL);
972 FtaInstallPrefix(PREFIX_ACCESSION, entry->locusname, NULL);
973 }
974
975 if(pp->mode != Parser::EMode::Relaxed && !badlocus)
976 {
977 if(pp->format == Parser::EFormat::SPROT)
978 {
979 if(ptr->next == NULL || ptr->next->str == NULL ||
980 (StringNICmp(ptr->next->str, "preliminary", 11) != 0 &&
981 StringNICmp(ptr->next->str, "unreviewed", 10) != 0))
982 badlocus = CheckLocusSP(entry->locusname);
983 else
984 badlocus = false;
985 }
986 else if(pp->format == Parser::EFormat::PIR || pp->format == Parser::EFormat::PRF)
987 badlocus = false;
988 else
989 badlocus = CheckLocus(entry->locusname, pp->source);
990 }
991 }
992 else if (pp->mode != Parser::EMode::Relaxed)
993 {
994 badlocus = true;
995 ErrPostStr(SEV_ERROR, ERR_LOCUS_NoLocusName,
996 "No locus name for this entry");
997 }
998
999 if(badlocus)
1000 {
1001 p = StringChr(finfo->str, '\n');
1002 if(p != NULL)
1003 *p = '\0';
1004 ErrPostEx(SEV_ERROR, ERR_ENTRY_Skipped,
1005 "Entry skipped. LOCUS line = \"%s\".", finfo->str);
1006 if(p != NULL)
1007 *p = '\n';
1008 MemFree(entry);
1009 FreeTokenstatblk(stoken);
1010 return(NULL);
1011 }
1012
1013 if(pp->format == Parser::EFormat::PIR || pp->format == Parser::EFormat::PRF)
1014 {
1015 FreeTokenstatblk(stoken);
1016 return(entry);
1017 }
1018
1019 bases = GetResidue(stoken);
1020 if(bases != NULL)
1021 entry->bases = (size_t) atoi(bases);
1022
1023 if(pp->format == Parser::EFormat::GenBank &&
1024 entry->lc.date > -1)
1025 {
1026 /* last token in the LOCUS line is date of the update's data
1027 */
1028 for(i = 1, ptr = stoken->list; i < stoken->num; i++)
1029 ptr = ptr->next;
1030 entry->date = GetUpdateDate(ptr->str, pp->source);
1031 }
1032
1033 if(pp->source == Parser::ESource::DDBJ || pp->source == Parser::ESource::EMBL)
1034 {
1035 j = stoken->num - ((pp->format == Parser::EFormat::GenBank) ? 2 : 3);
1036 for(i = 1, ptr = stoken->list; i < j; i++)
1037 ptr = ptr->next;
1038
1039 if(pp->format == Parser::EFormat::EMBL)
1040 {
1041 if(StringNICmp(ptr->str, "TSA", 3) == 0)
1042 entry->is_tsa = true;
1043 else if(StringNICmp(ptr->str, "PAT", 3) == 0)
1044 entry->is_pat = true;
1045 }
1046
1047 ptr = ptr->next;
1048
1049 if(StringNICmp(ptr->str, "EST", 3) == 0)
1050 entry->EST = true;
1051 else if(StringNICmp(ptr->str, "STS", 3) == 0)
1052 entry->STS = true;
1053 else if(StringNICmp(ptr->str, "GSS", 3) == 0)
1054 entry->GSS = true;
1055 else if(StringNICmp(ptr->str, "HTC", 3) == 0)
1056 entry->HTC = true;
1057 else if(StringNICmp(ptr->str, "PAT", 3) == 0 &&
1058 pp->source == Parser::ESource::EMBL)
1059 entry->is_pat = true;
1060 }
1061 FreeTokenstatblk(stoken);
1062
1063 return(entry);
1064 }
1065
/**********************************************************
 *
 *   void DelNoneDigitTail(str):
 *
 *      Truncates "str", in place, right after its last
 *   digit, i.e. removes the trailing run of non-digit
 *   characters. A string without any digit becomes empty.
 *   NULL and empty strings are left alone.
 *
 **********************************************************/
void DelNoneDigitTail(char* str)
{
    if (str == NULL || str[0] == '\0')
        return;

    char* cut = str;            /* one past the last digit seen so far */

    for (char* scan = str; *scan != '\0'; ++scan) {
        if ('0' <= *scan && *scan <= '9')
            cut = scan + 1;
    }

    *cut = '\0';
}
1089
/* Cuts "str" at its first non-digit character, keeping only the leading
   run of digits; a string made of digits only (or an empty one) is left
   unchanged.
   Note that this differs from DelNoneDigitTail(), which cuts after the
   LAST digit: "12ab34" becomes "12" here. */
static void sDelNonDigitTail(string& str)
{
    const auto firstNonDigit = str.find_first_not_of("0123456789");

    if (firstNonDigit == string::npos) {
        return;
    }
    str.erase(firstNonDigit);
}
1100
1101
1102 /**********************************************************
1103 *
1104 * Here X is an alpha character, N - numeric one.
1105 * Return values:
1106 *
1107 * 1 - XXN (AB123456)
1108 * 2 - XX_N (NZ_123456)
1109 * 3 - XXXXN (AAAA01000001)
1110 * 4 - XX_XXXXN (NZ_AAAA01000001)
1111 * 5 - XXXXXN (AAAAA1234512)
1112 * 6 - XX_XXN (NZ_AB123456)
1113 * 7 - XXXXNNSN (AAAA01S000001 - scaffolds)
1114 * 8 - XXXXXXN (AAAAAA010000001)
1115 * 0 - all others
1116 *
1117 */
IsNewAccessFormat(const Char * acnum)1118 Int4 IsNewAccessFormat(const Char* acnum)
1119 {
1120 const Char* p = acnum;
1121
1122 if(p == NULL || *p == '\0')
1123 return(0);
1124
1125 if(p[0] >= 'A' && p[0] <= 'Z' && p[1] >= 'A' && p[1] <= 'Z')
1126 {
1127 if(p[2] >= '0' && p[2] <= '9')
1128 return(1);
1129 if(p[2] == '_')
1130 {
1131 if(p[3] >= '0' && p[3] <= '9')
1132 return(2);
1133 if(p[3] >= 'A' && p[3] <= 'Z' && p[4] >= 'A' && p[4] <= 'Z')
1134 {
1135 if(p[5] >= 'A' && p[5] <= 'Z' && p[6] >= 'A' && p[6] <= 'Z' &&
1136 p[7] >= '0' && p[7] <= '9')
1137 return(4);
1138 if(p[5] >= '0' && p[5] <= '9')
1139 return(6);
1140 }
1141 }
1142 if(p[2] >= 'A' && p[2] <= 'Z' && p[3] >= 'A' && p[3] <= 'Z')
1143 {
1144 if(p[4] >= 'A' && p[4] <= 'Z' && p[5] >= 'A' && p[5] <= 'Z' &&
1145 p[6] >= '0' && p[6] <= '9')
1146 return(8);
1147 if(p[4] >= '0' && p[4] <= '9')
1148 {
1149 if(p[5] >= '0' && p[5] <= '9' && p[6] == 'S' &&
1150 p[7] >= '0' && p[7] <= '9')
1151 return(7);
1152 return(3);
1153 }
1154
1155 if(p[4] >= 'A' && p[4] <= 'Z' && p[5] >= '0' && p[6] <= '9')
1156 return(5);
1157 }
1158 }
1159 return(0);
1160 }
1161
1162 /**********************************************************/
IsValidAccessPrefix(const char * acc,char ** accpref)1163 static bool IsValidAccessPrefix(const char* acc, char** accpref)
1164 {
1165 Int4 i = IsNewAccessFormat(acc);
1166 if(i == 0 || accpref == NULL)
1167 return false;
1168
1169 if(i > 2 && i < 9)
1170 return true;
1171
1172 char** b = accpref;
1173 for (; *b != NULL; b++)
1174 {
1175 if (StringNCmp(acc, *b, StringLen(*b)) == 0)
1176 break;
1177 }
1178
1179 return (*b != NULL);
1180 }
1181
1182 /**********************************************************/
/* Tells whether "acnum" looks like a master accession for the given
 * accession format (as returned by IsNewAccessFormat()): after the prefix
 * (4 characters for format 3, 6 for format 8, 7 for format 4) there must be
 * two digits followed by nothing but zeros up to the end of the string.
 * Any other format yields false.
 */
static bool fta_if_master_wgs_accession(const char* acnum, Int4 accformat)
{
    int prefixLen;

    switch(accformat)
    {
        case 3:
            prefixLen = 4;
            break;
        case 8:
            prefixLen = 6;
            break;
        case 4:
            prefixLen = 7;
            break;
        default:
            return false;
    }

    const char* ver = acnum + prefixLen;

    /* two digits are expected right after the prefix */
    if(ver[0] < '0' || ver[0] > '9' || ver[1] < '0' || ver[1] > '9')
        return false;

    /* everything after them has to be zeros */
    const char* tail = ver + 2;
    while(*tail == '0')
        ++tail;

    return(*tail == '\0');
}
1206
1207
s_IsVDBWGSScaffold(const CTempString & accession)1208 static bool s_IsVDBWGSScaffold(const CTempString& accession)
1209 {
1210 // 4+2+S+[6,7,8]
1211 if (accession.length() < 13 ||
1212 accession.length() > 15 ||
1213 accession[6] != 'S') {
1214 return false;
1215 }
1216
1217 // check that the first 4 chars are letters
1218 if (any_of(begin(accession),
1219 begin(accession)+4,
1220 [](const char c){ return !isalpha(c); })) {
1221 return false;
1222 }
1223
1224 // check that the next 2 chars are letters
1225 if (!isdigit(accession[4]) ||
1226 !isdigit(accession[5])) {
1227 return false;
1228 }
1229
1230 // The characters after 'S' should all be digits
1231 // with at least one non-zero digit
1232
1233 // First check for digits
1234 if (any_of(begin(accession)+7,
1235 end(accession),
1236 [](const char c){ return !isdigit(c); })) {
1237 return false;
1238 }
1239
1240 // Now check to see if at least one is not zero
1241 if (all_of(begin(accession)+7,
1242 end(accession),
1243 [](const char c) { return c == '0'; })) {
1244 return false;
1245 }
1246
1247 return true;
1248 }
1249
s_RefineWGSType(const CTempString & accession,int initialType)1250 static int s_RefineWGSType(const CTempString& accession, int initialType)
1251 {
1252 if (initialType == -1) {
1253 return initialType;
1254 }
1255 // Identify as TSA or TLS
1256 if(accession[0] == 'G') /* TSA-WGS */
1257 {
1258 switch(initialType)
1259 {
1260 case 0:
1261 return 4;
1262 case 1:
1263 return 5;
1264 case 3:
1265 return 6;
1266 default:
1267 return initialType;
1268 }
1269 }
1270
1271 if (accession[0] == 'K' || accession[1] == 'T') { // TLS
1272 switch(initialType)
1273 {
1274 case 0:
1275 return 10;
1276 case 1:
1277 return 11;
1278 case 3:
1279 return 12;
1280 default:
1281 return initialType;
1282 }
1283 }
1284
1285 if (initialType == 1) { // TSA again
1286 if (accession[0] == 'I') {
1287 return 8;
1288 }
1289 if (accession[0] == 'H') {
1290 return 9;
1291 }
1292 }
1293
1294 return initialType;
1295 }
1296
1297 /**********************************************************/
1298 /* Returns: 0 - if WGS project accession;
1299 * 1 - WGS contig accession;
1300 * 2 - WGS scaffold accession (2+6);
1301 * 3 - WGS master accession (XXXX00000000);
1302 * 4 - TSA-WGS project accession;
1303 * 5 - TSA-WGS contig accession
1304 * 6 - TSA-WGS master accession;
1305 * 7 - VDB WGS scaffold accession (4+2+S+[6,7,8]);
1306 * 8 - TSA-WGS contig DDBJ accession
1307 * 9 - TSA-WGS contig EMBL accession
1308 * 10 - TLS-WGS project accession;
1309 * 11 - TLS-WGS contig accession
1310 * 12 - TLS-WGS master accession;
1311 * -1 - something else.
1312 */
fta_if_wgs_acc(const CTempString & accession)1313 int fta_if_wgs_acc(const CTempString& accession)
1314 {
1315
1316 if (accession.empty() ||
1317 NStr::IsBlank(accession)) {
1318 return -1;
1319 }
1320
1321 const auto length = accession.length();
1322
1323 if(length == 8 &&
1324 k_WgsScaffoldPrefix.find(accession.substr(0,2)) != k_WgsScaffoldPrefix.end() &&
1325 all_of(begin(accession)+2, end(accession), [](const char c) { return isdigit(c); })) {
1326 return 2;
1327 }
1328
1329 if(length > 12 && length < 16 && accession[6] == 'S')
1330 {
1331 if (s_IsVDBWGSScaffold(accession)) {
1332 return 7;
1333 }
1334 return -1;
1335 }
1336
1337 const char* p = accession.data();
1338 if(StringNCmp(p, "NZ_", 3) == 0) {
1339 p += 3;
1340 }
1341 size_t j = StringLen(p);
1342 if(j < 12 || j > 17) {
1343 return -1;
1344 }
1345
1346 if(isdigit(p[4]))
1347 {
1348 if(all_of(p, p+4, [](const char c) { return isalpha(c); }) &&
1349 all_of(p+4, end(accession), [](const char c) { return isdigit(c); })) {
1350
1351 int i = -1;
1352 if (any_of(p+6, end(accession), [](const char c) { return c != '0'; })) {
1353 i = 1; // WGS contig
1354 }
1355 else
1356 if (p[4] == '0' && p[5] == '0') {
1357 i = 3; // WGS master
1358 }
1359 else {
1360 i = 0; // WGS project
1361 }
1362 return s_RefineWGSType(p, i);
1363 }
1364 return -1;
1365 }
1366
1367
1368 // 6 letters + 2 digits
1369 if (all_of(p, p+6, [](const char c){ return isalpha(c); }) &&
1370 all_of(p+6, end(accession), [](const char c) { return isdigit(c); })) {
1371
1372 if (any_of(p+8, end(accession), [](const char c) { return c != '0'; })) {
1373 return 1; // WGS contig
1374 }
1375
1376 if (p[6] == '0' && p[7] == '0') {
1377 return 3; // WGS master
1378 }
1379 return 0; // WGS project
1380 }
1381
1382 return -1; // unknown
1383 }
1384
1385 /**********************************************************/
IsSPROTAccession(const char * acc)1386 bool IsSPROTAccession(const char* acc)
1387 {
1388 const char **b;
1389
1390 if(acc == NULL || acc[0] == '\0')
1391 return false;
1392 size_t len = StringLen(acc);
1393 if(len != 6 && len != 8 && len != 10)
1394 return false;
1395 if(len == 8)
1396 {
1397 for (b = sprot_accpref; *b != NULL; b++)
1398 {
1399 if (StringNCmp(*b, acc, 2) == 0)
1400 break;
1401 }
1402
1403 return (*b != NULL);
1404 }
1405
1406 if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < '0' || acc[1] > '9' ||
1407 ((acc[3] < '0' || acc[3] > '9') && (acc[3] < 'A' || acc[3] > 'Z')) ||
1408 ((acc[4] < '0' || acc[4] > '9') && (acc[4] < 'A' || acc[4] > 'Z')) ||
1409 acc[5] < '0' || acc[5] > '9')
1410 return false;
1411
1412 if(acc[0] >= 'O' && acc[0] <= 'Q')
1413 {
1414 if((acc[2] < '0' || acc[2] > '9') && (acc[2] < 'A' || acc[2] > 'Z'))
1415 return false;
1416 }
1417 else if(acc[2] < 'A' || acc[2] > 'Z')
1418 return false;
1419
1420 if(len == 6)
1421 return true;
1422
1423 if(acc[0] >= 'O' && acc[0] <= 'Q')
1424 return false;
1425
1426 if(acc[6] < 'A' || acc[6] > 'Z' || acc[9] < '0' || acc[9] > '9' ||
1427 ((acc[7] < 'A' || acc[7] > 'Z') && (acc[7] < '0' || acc[7] > '9')) ||
1428 ((acc[8] < 'A' || acc[8] > 'Z') && (acc[8] < '0' || acc[8] > '9')))
1429 return false;
1430
1431 return true;
1432 }
1433
1434
1435
sCheckAccession(const list<string> & tokens,Parser::ESource source,Parser::EMode mode,const char * priacc,int skip)1436 static bool sCheckAccession(const list<string>& tokens,
1437 Parser::ESource source,
1438 Parser::EMode mode,
1439 const char* priacc, int skip)
1440 {
1441 TokenBlkPtr tbp;
1442 bool badac;
1443 bool res = true;
1444 bool iswgs;
1445 Char acnum[200];
1446 Int4 accformat;
1447 Int4 priformat;
1448 Int4 count;
1449 size_t i;
1450
1451 if(priacc == NULL || mode == Parser::EMode::Relaxed)
1452 return true;
1453
1454 auto it = tokens.begin();
1455 if (skip) {
1456 advance(it, skip);
1457 }
1458
1459 priformat = IsNewAccessFormat(priacc);
1460 if((priformat == 3 || priformat == 4 || priformat == 8) &&
1461 fta_if_master_wgs_accession(priacc, priformat) == false)
1462 iswgs = true;
1463 else
1464 iswgs = false;
1465
1466 count = 0;
1467 for(; it != tokens.end(); ++it)
1468 {
1469 StringCpy(acnum, it->c_str());
1470 if(acnum[0] == '-' && acnum[1] == '\0')
1471 continue;
1472
1473 if(skip == 2 && count == 0)
1474 accformat = priformat;
1475 else
1476 accformat = IsNewAccessFormat(acnum);
1477
1478 size_t len = StringLen(acnum);
1479 if(acnum[len-1] == ';')
1480 {
1481 len--;
1482 acnum[len] = '\0';
1483 }
1484 badac = false;
1485 if(accformat == 1)
1486 {
1487 if(len != 8 && len != 10)
1488 badac = true;
1489 else
1490 {
1491 for(i = 2; i < 8 && badac == false; i++)
1492 if(acnum[i] < '0' || acnum[i] > '9')
1493 badac = true;
1494 }
1495 }
1496 else if(accformat == 2)
1497 {
1498 if(len != 9 && len != 12)
1499 badac = true;
1500 else
1501 {
1502 for(i = 3; i < len && badac == false; i++)
1503 if(acnum[i] < '0' || acnum[i] > '9')
1504 badac = true;
1505 }
1506 }
1507 else if(accformat == 3)
1508 {
1509 if(len < 12 || len > 14)
1510 badac = true;
1511 else
1512 {
1513 for(i = 4; i < len && badac == false; i++)
1514 if(acnum[i] < '0' || acnum[i] > '9')
1515 badac = true;
1516 }
1517 }
1518 else if(accformat == 8)
1519 {
1520 if(len < 15 || len > 17)
1521 badac = true;
1522 else
1523 {
1524 for(i = 6; i < len && !badac; i++)
1525 if(acnum[i] < '0' || acnum[i] > '9')
1526 badac = true;
1527 }
1528 }
1529 else if(accformat == 4)
1530 {
1531 if(len < 15 || len > 17)
1532 badac = true;
1533 else
1534 {
1535 for(i = 7; i < len && badac == false; i++)
1536 if(acnum[i] < '0' || acnum[i] > '9')
1537 badac = true;
1538 }
1539 }
1540 else if(accformat == 5)
1541 {
1542 if(len != 12)
1543 badac = true;
1544 else
1545 {
1546 for(i = 5; i < len && badac == false; i++)
1547 if(acnum[i] < '0' || acnum[i] > '9')
1548 badac = true;
1549 }
1550 }
1551 else if(accformat == 6)
1552 {
1553 if(len != 11 || acnum[0] != 'N' || acnum[1] != 'Z' ||
1554 acnum[2] != '_' || acnum[3] < 'A' || acnum[3] > 'Z' ||
1555 acnum[4] < 'A' || acnum[4] > 'Z')
1556 badac = true;
1557 else
1558 {
1559 for(i = 5; i < len && badac == false; i++)
1560 if(acnum[i] < '0' || acnum[i] > '9')
1561 badac = true;
1562 }
1563 }
1564 else if(accformat == 7)
1565 {
1566 if(len < 13 || len > 15)
1567 badac = true;
1568 else
1569 {
1570 for(i = 7; i < len && badac == false; i++)
1571 if(acnum[i] < '0' || acnum[i] > '9')
1572 badac = true;
1573 }
1574 }
1575 else if(accformat == 0)
1576 {
1577 if(len != 6 && len != 10)
1578 badac = true;
1579 else if(acnum[0] >= 'A' && acnum[0] <= 'Z')
1580 {
1581 if(source == Parser::ESource::SPROT)
1582 {
1583 if(!IsSPROTAccession(acnum))
1584 badac = true;
1585 }
1586 else if(len == 10)
1587 {
1588 badac = true;
1589 }
1590 else
1591 {
1592 for(i = 1; i < 6 && badac == false; i++)
1593 if(acnum[i] < '0' || acnum[i] > '9')
1594 badac = true;
1595 }
1596 }
1597 else
1598 badac = true;
1599 }
1600 else
1601 badac = true;
1602
1603 if(badac)
1604 {
1605 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
1606 "Bad accession #, %s for this entry", acnum);
1607 res = false;
1608 count++;
1609 continue;
1610 }
1611
1612 if(skip == 2 && count == 0 && !iswgs &&
1613 (accformat == 3 || accformat == 4 || accformat == 8))
1614 {
1615 ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSProjectAccIsPri,
1616 "This record has a WGS 'project' accession as its primary accession number. WGS project-accessions are only expected to be used as secondary accession numbers.");
1617 res = false;
1618 }
1619 count++;
1620 }
1621
1622 return(res);
1623 }
1624
1625 /**********************************************************
1626 *
1627 * static bool CheckAccession(stoken, source, entryacc,
1628 * skip):
1629 *
1630 * A valid accession number should be an upper case
1631 * letter (A-Z) followed by 5 digits, put "reject" message
1632 * if not.
1633 *
1634 * 7-6-93
1635 *
1636 **********************************************************/
CheckAccession(TokenStatBlkPtr stoken,Parser::ESource source,Parser::EMode mode,char * priacc,Int4 skip)1637 static bool CheckAccession(TokenStatBlkPtr stoken,
1638 Parser::ESource source,
1639 Parser::EMode mode,
1640 char* priacc, Int4 skip)
1641 {
1642 TokenBlkPtr tbp;
1643 bool badac;
1644 bool res = true;
1645 bool iswgs;
1646 Char acnum[200];
1647 Int4 accformat;
1648 Int4 priformat;
1649 Int4 count;
1650 size_t i;
1651
1652 if(priacc == NULL || mode == Parser::EMode::Relaxed)
1653 return true;
1654
1655 tbp = (skip == 0) ? stoken->list : stoken->list->next;
1656 priformat = IsNewAccessFormat(priacc);
1657 if((priformat == 3 || priformat == 4 || priformat == 8) &&
1658 fta_if_master_wgs_accession(priacc, priformat) == false)
1659 iswgs = true;
1660 else
1661 iswgs = false;
1662
1663 count = 0;
1664 for(; tbp != NULL; tbp = tbp->next)
1665 {
1666 StringCpy(acnum, tbp->str);
1667 if(acnum[0] == '-' && acnum[1] == '\0')
1668 continue;
1669
1670 if(skip == 2 && count == 0)
1671 accformat = priformat;
1672 else
1673 accformat = IsNewAccessFormat(acnum);
1674
1675 size_t len = StringLen(acnum);
1676 if(acnum[len-1] == ';')
1677 {
1678 len--;
1679 acnum[len] = '\0';
1680 }
1681 badac = false;
1682 if(accformat == 1)
1683 {
1684 if(len != 8 && len != 10)
1685 badac = true;
1686 else
1687 {
1688 for(i = 2; i < 8 && badac == false; i++)
1689 if(acnum[i] < '0' || acnum[i] > '9')
1690 badac = true;
1691 }
1692 }
1693 else if(accformat == 2)
1694 {
1695 if(len != 9 && len != 12)
1696 badac = true;
1697 else
1698 {
1699 for(i = 3; i < len && badac == false; i++)
1700 if(acnum[i] < '0' || acnum[i] > '9')
1701 badac = true;
1702 }
1703 }
1704 else if(accformat == 3)
1705 {
1706 if(len < 12 || len > 14)
1707 badac = true;
1708 else
1709 {
1710 for(i = 4; i < len && badac == false; i++)
1711 if(acnum[i] < '0' || acnum[i] > '9')
1712 badac = true;
1713 }
1714 }
1715 else if(accformat == 8)
1716 {
1717 if(len < 15 || len > 17)
1718 badac = true;
1719 else
1720 {
1721 for(i = 6; i < len && !badac; i++)
1722 if(acnum[i] < '0' || acnum[i] > '9')
1723 badac = true;
1724 }
1725 }
1726 else if(accformat == 4)
1727 {
1728 if(len < 15 || len > 17)
1729 badac = true;
1730 else
1731 {
1732 for(i = 7; i < len && badac == false; i++)
1733 if(acnum[i] < '0' || acnum[i] > '9')
1734 badac = true;
1735 }
1736 }
1737 else if(accformat == 5)
1738 {
1739 if(len != 12)
1740 badac = true;
1741 else
1742 {
1743 for(i = 5; i < len && badac == false; i++)
1744 if(acnum[i] < '0' || acnum[i] > '9')
1745 badac = true;
1746 }
1747 }
1748 else if(accformat == 6)
1749 {
1750 if(len != 11 || acnum[0] != 'N' || acnum[1] != 'Z' ||
1751 acnum[2] != '_' || acnum[3] < 'A' || acnum[3] > 'Z' ||
1752 acnum[4] < 'A' || acnum[4] > 'Z')
1753 badac = true;
1754 else
1755 {
1756 for(i = 5; i < len && badac == false; i++)
1757 if(acnum[i] < '0' || acnum[i] > '9')
1758 badac = true;
1759 }
1760 }
1761 else if(accformat == 7)
1762 {
1763 if(len < 13 || len > 15)
1764 badac = true;
1765 else
1766 {
1767 for(i = 7; i < len && badac == false; i++)
1768 if(acnum[i] < '0' || acnum[i] > '9')
1769 badac = true;
1770 }
1771 }
1772 else if(accformat == 0)
1773 {
1774 if(len != 6 && len != 10)
1775 badac = true;
1776 else if(acnum[0] >= 'A' && acnum[0] <= 'Z')
1777 {
1778 if(source == Parser::ESource::SPROT)
1779 {
1780 if(!IsSPROTAccession(acnum))
1781 badac = true;
1782 }
1783 else if(len == 10)
1784 {
1785 badac = true;
1786 }
1787 else
1788 {
1789 for(i = 1; i < 6 && badac == false; i++)
1790 if(acnum[i] < '0' || acnum[i] > '9')
1791 badac = true;
1792 }
1793 }
1794 else
1795 badac = true;
1796 }
1797 else
1798 badac = true;
1799
1800 if(badac)
1801 {
1802 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
1803 "Bad accession #, %s for this entry", acnum);
1804 res = false;
1805 count++;
1806 continue;
1807 }
1808
1809 if(skip == 2 && count == 0 && !iswgs &&
1810 (accformat == 3 || accformat == 4 || accformat == 8))
1811 {
1812 ErrPostEx(SEV_REJECT, ERR_ACCESSION_WGSProjectAccIsPri,
1813 "This record has a WGS 'project' accession as its primary accession number. WGS project-accessions are only expected to be used as secondary accession numbers.");
1814 res = false;
1815 }
1816 count++;
1817 }
1818
1819 return(res);
1820 }
1821
1822 /**********************************************************/
IsPatentedAccPrefix(const Parser & parseInfo,const char * acc)1823 static bool IsPatentedAccPrefix(const Parser& parseInfo, const char* acc)
1824 {
1825 if(acc[2] == '\0')
1826 {
1827 if((StringCmp(acc, "AR") == 0 || StringCmp(acc, "DZ") == 0 ||
1828 StringCmp(acc, "EA") == 0 || StringCmp(acc, "GC") == 0 ||
1829 StringCmp(acc, "GP") == 0 || StringCmp(acc, "GV") == 0 ||
1830 StringCmp(acc, "GX") == 0 || StringCmp(acc, "GY") == 0 ||
1831 StringCmp(acc, "GZ") == 0 || StringCmp(acc, "HJ") == 0 ||
1832 StringCmp(acc, "HK") == 0 || StringCmp(acc, "HL") == 0 ||
1833 StringCmp(acc, "KH") == 0 || StringCmp(acc, "MI") == 0 ||
1834 StringCmp(acc, "MM") == 0 || StringCmp(acc, "MO") == 0) &&
1835 (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1836 return true;
1837 if((StringNCmp(acc, "AX", 2) == 0 || StringNCmp(acc, "CQ", 2) == 0 ||
1838 StringNCmp(acc, "CS", 2) == 0 || StringNCmp(acc, "FB", 2) == 0 ||
1839 StringNCmp(acc, "HA", 2) == 0 || StringNCmp(acc, "HB", 2) == 0 ||
1840 StringNCmp(acc, "HC", 2) == 0 || StringNCmp(acc, "HD", 2) == 0 ||
1841 StringNCmp(acc, "HH", 2) == 0 || StringNCmp(acc, "GM", 2) == 0 ||
1842 StringNCmp(acc, "GN", 2) == 0 || StringNCmp(acc, "JA", 2) == 0 ||
1843 StringNCmp(acc, "JB", 2) == 0 || StringNCmp(acc, "JC", 2) == 0 ||
1844 StringNCmp(acc, "JD", 2) == 0 || StringNCmp(acc, "JE", 2) == 0 ||
1845 StringNCmp(acc, "HI", 2) == 0 || StringNCmp(acc, "LP", 2) == 0 ||
1846 StringNCmp(acc, "LQ", 2) == 0 || StringNCmp(acc, "MP", 2) == 0 ||
1847 StringNCmp(acc, "MQ", 2) == 0 || StringNCmp(acc, "MR", 2) == 0 ||
1848 StringNCmp(acc, "MS", 2) == 0) &&
1849 (parseInfo.all == true || parseInfo.source == Parser::ESource::EMBL))
1850 return true;
1851 if ((StringNCmp(acc, "BD", 2) == 0 || StringNCmp(acc, "DD", 2) == 0 ||
1852 StringNCmp(acc, "DI", 2) == 0 || StringNCmp(acc, "DJ", 2) == 0 ||
1853 StringNCmp(acc, "DL", 2) == 0 || StringNCmp(acc, "DM", 2) == 0 ||
1854 StringNCmp(acc, "FU", 2) == 0 || StringNCmp(acc, "FV", 2) == 0 ||
1855 StringNCmp(acc, "FW", 2) == 0 || StringNCmp(acc, "FZ", 2) == 0 ||
1856 StringNCmp(acc, "GB", 2) == 0 || StringNCmp(acc, "HV", 2) == 0 ||
1857 StringNCmp(acc, "HW", 2) == 0 || StringNCmp(acc, "HZ", 2) == 0 ||
1858 StringNCmp(acc, "LF", 2) == 0 || StringNCmp(acc, "LG", 2) == 0 ||
1859 StringNCmp(acc, "LV", 2) == 0 || StringNCmp(acc, "LX", 2) == 0 ||
1860 StringNCmp(acc, "LY", 2) == 0 || StringNCmp(acc, "LZ", 2) == 0 ||
1861 StringNCmp(acc, "MA", 2) == 0 || StringNCmp(acc, "MB", 2) == 0 ||
1862 StringNCmp(acc, "MC", 2) == 0 || StringNCmp(acc, "MD", 2) == 0 ||
1863 StringNCmp(acc, "ME", 2) == 0 || StringNCmp(acc, "OF", 2) == 0) &&
1864 (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1865 return true;
1866
1867 return false;
1868 }
1869
1870 if(acc[1] == '\0' && (*acc == 'I' || *acc == 'A' || *acc == 'E'))
1871 {
1872 if(parseInfo.all == true ||
1873 (*acc == 'I' && parseInfo.source == Parser::ESource::NCBI) ||
1874 (*acc == 'A' && parseInfo.source == Parser::ESource::EMBL) ||
1875 (*acc == 'E' && parseInfo.source == Parser::ESource::DDBJ))
1876 return true;
1877 }
1878 return false;
1879 }
1880
1881 /**********************************************************/
IsTPAAccPrefix(const Parser & parseInfo,const char * acc)1882 static bool IsTPAAccPrefix(const Parser& parseInfo, const char* acc)
1883 {
1884 if(acc == NULL)
1885 return(false);
1886
1887 size_t i = StringLen(acc);
1888 if(i != 2 && i != 4)
1889 return(false);
1890
1891 if(i == 4)
1892 {
1893 if(acc[0] == 'D' &&
1894 (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1895 return(true);
1896 if(acc[0] == 'E' &&
1897 (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1898 return(true);
1899 return(false);
1900 }
1901
1902 if(fta_StringMatch(ncbi_tpa_accpref, acc) > -1 &&
1903 (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1904 return(true);
1905 if(fta_StringMatch(ddbj_tpa_accpref, acc) > -1 &&
1906 (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1907 return(true);
1908 return(false);
1909 }
1910
1911 /**********************************************************/
IsWGSAccPrefix(const Parser & parseInfo,const char * acc)1912 static bool IsWGSAccPrefix(const Parser& parseInfo, const char* acc)
1913 {
1914 if(acc == NULL || StringLen(acc) != 2)
1915 return(false);
1916
1917 if(fta_StringMatch(ncbi_wgs_accpref, acc) > -1 &&
1918 (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1919 return(true);
1920 if(fta_StringMatch(ddbj_wgs_accpref, acc) > -1 &&
1921 (parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ))
1922 return(true);
1923 return(false);
1924 }
1925
1926 /**********************************************************/
IsTSAAccPrefix(const Parser & parseInfo,const char * acc,IndexblkPtr ibp)1927 static void IsTSAAccPrefix(const Parser& parseInfo, const char* acc, IndexblkPtr ibp)
1928 {
1929 if(acc == NULL || *acc == '\0')
1930 return;
1931
1932 if(parseInfo.source == Parser::ESource::EMBL)
1933 {
1934 ibp->tsa_allowed = true;
1935 return;
1936 }
1937
1938 if(acc[0] == 'U' && acc[1] == '\0' &&
1939 (parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI))
1940 {
1941 ibp->tsa_allowed = true;
1942 return;
1943 }
1944
1945 if(StringLen(acc) != 2 && StringLen(acc) != 4)
1946 return;
1947
1948 if(parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI)
1949 {
1950 if((StringLen(acc) == 2 &&
1951 (StringCmp(acc, "EZ") == 0 || StringCmp(acc, "HP") == 0 ||
1952 StringCmp(acc, "JI") == 0 || StringCmp(acc, "JL") == 0 ||
1953 StringCmp(acc, "JO") == 0 || StringCmp(acc, "JP") == 0 ||
1954 StringCmp(acc, "JR") == 0 || StringCmp(acc, "JT") == 0 ||
1955 StringCmp(acc, "JU") == 0 || StringCmp(acc, "JV") == 0 ||
1956 StringCmp(acc, "JW") == 0 || StringCmp(acc, "KA") == 0)) ||
1957 fta_if_wgs_acc(ibp->acnum) == 5)
1958 {
1959 ibp->is_tsa = true;
1960 ibp->tsa_allowed = true;
1961 }
1962 if(fta_StringMatch(acc_tsa_allowed, acc) > -1)
1963 ibp->tsa_allowed = true;
1964 }
1965
1966 if(parseInfo.all == true || parseInfo.source == Parser::ESource::DDBJ)
1967 {
1968 if(StringNCmp(acc, "FX", 2) == 0 || StringNCmp(acc, "LA", 2) == 0 ||
1969 StringNCmp(acc, "LE", 2) == 0 || StringNCmp(acc, "LH", 2) == 0 ||
1970 StringNCmp(acc, "LI", 2) == 0 || StringNCmp(acc, "LJ", 2) == 0 ||
1971 fta_if_wgs_acc(ibp->acnum) == 8)
1972 {
1973 ibp->is_tsa = true;
1974 ibp->tsa_allowed = true;
1975 }
1976 }
1977
1978 if(parseInfo.all == true || parseInfo.source == Parser::ESource::EMBL)
1979 {
1980 if(fta_if_wgs_acc(ibp->acnum) == 9)
1981 {
1982 ibp->is_tsa = true;
1983 ibp->tsa_allowed = true;
1984 }
1985 }
1986 }
1987
1988 /**********************************************************/
IsTLSAccPrefix(const Parser & parseInfo,const char * acc,IndexblkPtr ibp)1989 static void IsTLSAccPrefix(const Parser& parseInfo, const char* acc, IndexblkPtr ibp)
1990 {
1991 if(acc == NULL || *acc == '\0' || StringLen(acc) != 4)
1992 return;
1993
1994 if(parseInfo.all == true || parseInfo.source == Parser::ESource::NCBI ||
1995 parseInfo.source == Parser::ESource::DDBJ)
1996 if(fta_if_wgs_acc(ibp->acnum) == 11)
1997 ibp->is_tls = true;
1998 }
1999
/* Tells whether "c" can be part of an accession prefix, i.e. is an upper
 * case letter 'A'..'Z'.
 */
static bool sIsAccPrefixChar(char c)
{
    if(c < 'A')
        return false;
    return c <= 'Z';
}
2003 /**********************************************************
2004 *
2005 * bool GetAccession(pp, str, entry, skip):
2006 *
2007 * Only record the first line of the first accession
2008 * number.
2009 * PIR format, accession number does not follow
2010 * the rule.
2011 *
2012 * 3-4-93
2013 *
2014 **********************************************************/
2015 /*
2016 bool GetAccession(const Parser& parseInfo, const CTempString& str, IndexblkPtr entry, int skip)
2017 {
2018 string accession;
2019 list<string> tokens;
2020 bool get = true;
2021
2022 if((skip != 2 && parseInfo.source == Parser::ESource::Flybase) ||
2023 parserInfo.source == Parser::ESource::USPTO)
2024 return true;
2025
2026 NStr::Split(str, " ;", tokens, NStr::fSplit_Tokenize);
2027
2028
2029 if (skip != 2)
2030 {
2031 get = ParseAccessionRange(tokens, skip);
2032 if (get)
2033 get = sCheckAccession(tokens, parseInfo.source, parseInfo.mode, entry->acnum, skip);
2034 if (!get)
2035 entry->drop = 1;
2036
2037 if (tokens.size()>skip && skip<2) { // Not sure about the logic
2038 auto it = skip ? next(tokens.begin(), skip) : tokens.begin();
2039 move(it, tokens.end(), entry->secondary_accessions.end());
2040 }
2041 return get;
2042 }
2043
2044 // skip == 2
2045 entry->is_tpa = false;
2046 if(tokens.size() < 2)
2047 {
2048 if (parseInfo.mode != Parser::EMode::Relaxed) {
2049 ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
2050 "No accession # for this entry, about line %ld",
2051 (long int) entry->linenum);
2052 entry->drop = 1;
2053 }
2054 return false;
2055 }
2056
2057
2058 accession = *next(tokens.begin());
2059 sDelNonDigitTail(accession);
2060
2061 StringCpy(entry->acnum, accession.c_str());
2062
2063 if (parseInfo.format != Parser::EFormat::XML) {
2064 string temp = accession;
2065 if (parseInfo.accver && entry->vernum > 0) {
2066 temp += "." + NStr::NumericToString(entry->vernum);
2067 }
2068 if (temp.empty()) {
2069 if (entry->locusname[0] != '\0') {
2070 temp = entry->locusname;
2071 }
2072 else {
2073 temp = "???";
2074 }
2075 }
2076 FtaInstallPrefix(PREFIX_ACCESSION, temp.c_str(), NULL);
2077 }
2078
2079 if (parseInfo.source == Parser::ESource::Flybase)
2080 {
2081 return true;
2082 }
2083
2084 if (accession.size() < 2) {
2085 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2086 "Wrong accession [%s] for this entry.", accession.c_str());
2087 entry->drop = 1;
2088 return false;
2089 }
2090
2091 if (sIsAccPrefixChar(accession[0]) && sIsAccPrefixChar(accession[1])) {
2092 if (parseInfo.accpref && !IsValidAccessPrefix(accession.c_str(), parseInfo.accpref)) {
2093 get = false;
2094 }
2095
2096 if (sIsAccPrefixChar(accession[2]) && sIsAccPrefixChar(accession[3])) {
2097 if (sIsAccPrefixChar(accession[4])) {
2098 accession = accession.substr(0,5);
2099 }
2100 else {
2101 accession = accession.substr(0,4);
2102 }
2103 }
2104 else if (accession[2] == '_') {
2105 accession = accession.substr(0,3);
2106 }
2107 else {
2108 accession = accession.substr(0,2);
2109 }
2110 }
2111 else {
2112 if (parseInfo.acprefix && !StringChr(parseInfo.acprefix, accession[0])) {
2113 get = false;
2114 }
2115 accession = accession.substr(0,1);
2116 }
2117
2118 if (get) {
2119 if (tokens.size() > 2) {
2120 get = ParseAccessionRange(tokens,2);
2121 if (get) {
2122 get = sCheckAccession(tokens, parseInfo.source, parseInfo.mode, entry->acnum, 2);
2123 }
2124 }
2125 }
2126 else {
2127 string sourceName = sourceNames.at(parseInfo.source);
2128 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2129 "Wrong accession # prefix [%s] for this source: %s",
2130 accession.c_str(), sourceName.c_str());
2131 }
2132
2133 entry->secondary_accessions.clear(); // Is this necessary?
2134 move(next(tokens.begin(),2), tokens.end(), entry->secondary_accessions.begin());
2135
2136 if (!entry->is_pat) {
2137 entry->is_pat = IsPatentedAccPrefix(parseInfo, accession.c_str());
2138 }
2139 entry->is_tpa = IsTPAAccPrefix(parseInfo, accession.c_str());
2140 entry->is_wgs = IsWGSAccPrefix(parseInfo, accession.c_str());
2141 IsTSAAccPrefix(parseInfo, accession.c_str(), entry);
2142 IsTLSAccPrefix(parseInfo, accession.c_str(), entry);
2143
2144 auto i = IsNewAccessFormat(entry->acnum);
2145 if(i == 3 || i == 8)
2146 {
2147 entry->is_wgs = true;
2148 entry->wgs_and_gi |= 02;
2149 }
2150 else if(i == 5)
2151 {
2152 char* p = entry->acnum;
2153 if(parseInfo.source != Parser::ESource::DDBJ || *p != 'A' || StringLen(p) != 12 ||
2154 StringCmp(p + 5, "0000000") != 0)
2155 {
2156 string sourceName = sourceNames.at(parseInfo.source);
2157 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2158 "Wrong accession \"%s\" for this source: %s",
2159 p, sourceName.c_str());
2160 get = false;
2161 }
2162 entry->is_mga = true;
2163 }
2164
2165 if(!get)
2166 entry->drop = 1;
2167
2168 return get;
2169 }
2170 */
2171
2172
GetAccession(ParserPtr pp,char * str,IndexblkPtr entry,Int4 skip)2173 bool GetAccession(ParserPtr pp, char* str, IndexblkPtr entry, Int4 skip)
2174 {
2175 Char acc[200];
2176 Char temp[400];
2177 char* line;
2178 char* p;
2179 TokenStatBlkPtr stoken;
2180 TokenBlkPtr tbp;
2181 TokenBlkPtr ttbp;
2182 bool get = true;
2183 Int4 i;
2184
2185 if((skip != 2 && pp->source == Parser::ESource::Flybase) ||
2186 pp->source == Parser::ESource::USPTO)
2187 return true;
2188
2189 line = StringSave(str);
2190 for(p = line; *p != '\0'; p++)
2191 if(*p == ';')
2192 *p = ' ';
2193 stoken = TokenString(line, ' ');
2194
2195 if(skip != 2)
2196 {
2197 get = ParseAccessionRange(stoken, skip);
2198 if(get)
2199 get = CheckAccession(stoken, pp->source, pp->mode, entry->acnum, skip);
2200 if(!get)
2201 entry->drop = 1;
2202
2203 if(skip == 0)
2204 {
2205 tbp = stoken->list;
2206 stoken->list = NULL;
2207 }
2208 else if(skip == 1 && stoken->list != NULL)
2209 {
2210 tbp = stoken->list->next;
2211 stoken->list->next = NULL;
2212 }
2213 else
2214 tbp = NULL;
2215 if(tbp != NULL)
2216 {
2217 if(entry->secaccs == NULL)
2218 entry->secaccs = tbp;
2219 else
2220 {
2221 for(ttbp = entry->secaccs; ttbp->next != NULL;)
2222 ttbp = ttbp->next;
2223 ttbp->next = tbp;
2224 }
2225 }
2226
2227 FreeTokenstatblk(stoken);
2228 MemFree(line);
2229 return(get);
2230 }
2231
2232 entry->is_tpa = false;
2233 acc[0] = '\0';
2234 if(stoken->num < 2)
2235 {
2236 if (pp->mode != Parser::EMode::Relaxed) {
2237 ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
2238 "No accession # for this entry, about line %ld",
2239 (long int) entry->linenum);
2240 entry->drop = 1;
2241 }
2242 FreeTokenstatblk(stoken);
2243 MemFree(line);
2244 return false;
2245 }
2246
2247 StringCpy(acc, stoken->list->next->str); /* get first accession */
2248
2249 if (pp->mode != Parser::EMode::Relaxed) {
2250 DelNoneDigitTail(acc);
2251 }
2252
2253 StringCpy(entry->acnum, acc);
2254
2255 if(pp->format != Parser::EFormat::XML)
2256 {
2257 if(pp->accver && entry->vernum > 0)
2258 sprintf(temp, "%s.%d", acc, entry->vernum);
2259 else
2260 StringCpy(temp, acc);
2261
2262 if(*temp == '\0')
2263 {
2264 if(entry->locusname[0] != '\0')
2265 StringCpy(temp, entry->locusname);
2266 else
2267 StringCpy(temp, "???");
2268 }
2269 FtaInstallPrefix(PREFIX_ACCESSION, temp, NULL);
2270 }
2271
2272 if(pp->source == Parser::ESource::Flybase)
2273 {
2274 FreeTokenstatblk(stoken);
2275 MemFree(line);
2276 return true;
2277 }
2278
2279 if((StringLen(acc) < 2) &&
2280 pp->mode != Parser::EMode::Relaxed)
2281 {
2282 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2283 "Wrong accession [%s] for this entry.", acc);
2284 FreeTokenstatblk(stoken);
2285 entry->drop = 1;
2286 MemFree(line);
2287 return false;
2288 }
2289
2290 if (pp->mode != Parser::EMode::Relaxed) {
2291 if(acc[0] >= 'A' && acc[0] <= 'Z' && acc[1] >= 'A' && acc[1] <= 'Z')
2292 {
2293 if(IsValidAccessPrefix(acc, pp->accpref) == false && pp->accpref != NULL)
2294 get = false;
2295 if(acc[2] >= 'A' && acc[2] <= 'Z' && acc[3] >= 'A' && acc[3] <= 'Z')
2296 {
2297 if(acc[4] >= 'A' && acc[4] <= 'Z') {
2298 acc[5] = '\0';
2299 }
2300 else {
2301 acc[4] = '\0';
2302 }
2303 }
2304 else if(acc[2] == '_') {
2305 acc[3] = '\0';
2306 }
2307 else {
2308 acc[2] = '\0';
2309 }
2310 }
2311 else
2312 {
2313 /* Processing of accession numbers in old format
2314 */
2315 /* check valid prefix accession number
2316 */
2317 if(pp->acprefix != NULL && StringChr(pp->acprefix, *acc) == NULL)
2318 get = false;
2319 acc[1] = '\0';
2320 }
2321 }
2322
2323 if(get)
2324 {
2325 if (stoken->num > 2)
2326 get = ParseAccessionRange(stoken, 2);
2327 if (get) {
2328 get = CheckAccession(stoken, pp->source, pp->mode, entry->acnum, 2);
2329 }
2330 }
2331 else
2332 {
2333 string sourceName = sourceNames.at(pp->source);
2334 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2335 "Wrong accession # prefix [%s] for this source: %s",
2336 acc, sourceName.c_str());
2337 }
2338
2339 entry->secaccs = stoken->list->next->next;
2340 stoken->list->next->next = NULL;
2341
2342 FreeTokenstatblk(stoken);
2343
2344 if(!entry->is_pat)
2345 entry->is_pat = IsPatentedAccPrefix(*pp, acc);
2346 entry->is_tpa = IsTPAAccPrefix(*pp, acc);
2347 entry->is_wgs = IsWGSAccPrefix(*pp, acc);
2348 IsTSAAccPrefix(*pp, acc, entry);
2349 IsTLSAccPrefix(*pp, acc, entry);
2350
2351 i = IsNewAccessFormat(entry->acnum);
2352 if(i == 3 || i == 8)
2353 {
2354 entry->is_wgs = true;
2355 entry->wgs_and_gi |= 02;
2356 }
2357 else if(i == 5)
2358 {
2359 p = entry->acnum;
2360 if(pp->source != Parser::ESource::DDBJ || *p != 'A' || StringLen(p) != 12 ||
2361 StringCmp(p + 5, "0000000") != 0)
2362 {
2363 string sourceName = sourceNames.at(pp->source);
2364 ErrPostEx(SEV_ERROR, ERR_ACCESSION_BadAccessNum,
2365 "Wrong accession \"%s\" for this source: %s",
2366 p, sourceName.c_str());
2367 get = false;
2368 }
2369 entry->is_mga = true;
2370 }
2371
2372 MemFree(line);
2373
2374 if(!get)
2375 entry->drop = 1;
2376
2377 return(get);
2378 }
2379
2380 /**********************************************************/
ResetParserStruct(ParserPtr pp)2381 void ResetParserStruct(ParserPtr pp)
2382 {
2383 if(pp == NULL)
2384 return;
2385
2386 if(pp->entrylist != NULL)
2387 {
2388 for(Int4 i = 0; i < pp->indx; i++)
2389 if(pp->entrylist[i] != NULL)
2390 FreeIndexblk(pp->entrylist[i]);
2391
2392 MemFree(pp->entrylist);
2393 pp->entrylist = NULL;
2394 }
2395
2396 pp->indx = 0;
2397 pp->curindx = 0;
2398
2399 if(pp->pbp != NULL)
2400 {
2401 if(pp->pbp->ibp != NULL)
2402 delete pp->pbp->ibp;
2403 delete pp->pbp;
2404 pp->pbp = NULL;
2405 }
2406
2407
2408 if(pp->operon != NULL)
2409 {
2410 fta_operon_free(pp->operon);
2411 pp->operon = NULL;
2412 }
2413 }
2414
2415 /**********************************************************
2416 *
2417 * void FreeParser(pp):
2418 *
2419 * 3-5-93
2420 *
2421 **********************************************************/
2422 /*
2423 void FreeParser(ParserPtr pp)
2424 {
2425 if(pp == NULL)
2426 return;
2427
2428 ResetParserStruct(pp);
2429
2430 if(pp->fpo != NULL)
2431 MemFree(pp->fpo);
2432 delete pp;
2433 }
2434 */
2435
2436 /**********************************************************
2437 *
2438 * void CloseFiles(pp):
2439 *
2440 * 3-4-93
2441 *
2442 **********************************************************/
CloseFiles(ParserPtr pp)2443 void CloseFiles(ParserPtr pp)
2444 {
2445 if(pp->qsfd != NULL)
2446 {
2447 fclose(pp->qsfd);
2448 pp->qsfd = NULL;
2449 }
2450 }
2451
2452 /**********************************************************
2453 *
2454 * void MsgSkipTitleFail(flatfile, finfo):
2455 *
2456 * 7-2-93
2457 *
2458 **********************************************************/
MsgSkipTitleFail(const char * flatfile,FinfoBlkPtr finfo)2459 void MsgSkipTitleFail(const char *flatfile, FinfoBlkPtr finfo)
2460 {
2461 ErrPostEx(SEV_ERROR, ERR_ENTRY_Begin,
2462 "No valid beginning of entry found in %s file", flatfile);
2463
2464 MemFree(finfo);
2465 }
2466
2467
2468 /**********************************************************/
FindNextEntryBuf(bool end_of_file,FileBuf & fbuf,FinfoBlkPtr finfo,const char * str,Int2 len)2469 bool FindNextEntryBuf(bool end_of_file, FileBuf& fbuf, FinfoBlkPtr finfo, const char *str, Int2 len)
2470 {
2471 bool done = end_of_file;
2472 while (!done && StringNCmp(finfo->str, str, len) != 0)
2473 done = XReadFileBuf(fbuf, finfo);
2474
2475 return(done);
2476 }
2477
2478
FindNextEntryBuf(bool end_of_file,FileBuf & fbuf,FinfoBlkPtr finfo,const CTempString & keyword)2479 bool FindNextEntryBuf(bool end_of_file, FileBuf& fbuf, FinfoBlkPtr finfo, const CTempString& keyword)
2480 {
2481 return FindNextEntryBuf(end_of_file, fbuf, finfo, keyword.data(), keyword.size());
2482 }
2483
2484
2485 /**********************************************************
2486 *
2487 * bool FlatFileIndex(pp, (*fun)()):
2488 *
2489 * 10-6-93
2490 *
2491 **********************************************************/
FlatFileIndex(ParserPtr pp,void (* fun)(IndexblkPtr entry,char * offset,Int4 len))2492 bool FlatFileIndex(ParserPtr pp, void (*fun)(IndexblkPtr entry, char* offset, Int4 len))
2493 {
2494 bool index;
2495
2496 switch(pp->format)
2497 {
2498 case Parser::EFormat::GenBank:
2499 index = GenBankIndex(pp);
2500 break;
2501 case Parser::EFormat::EMBL:
2502 index = EmblIndex(pp, fun);
2503 break;
2504 case Parser::EFormat::SPROT:
2505 index = SprotIndex(pp, fun);
2506 break;
2507 case Parser::EFormat::PRF:
2508 index = PrfIndex(pp, fun);
2509 break;
2510 case Parser::EFormat::PIR:
2511 index = PirIndex(pp, fun);
2512 break;
2513 case Parser::EFormat::XML:
2514 index = XMLIndex(pp);
2515 break;
2516 default:
2517 index = false;
2518 fprintf(stderr, "Unknown flatfile format.\n");
2519 break;
2520 }
2521 return(index);
2522 }
2523
2524 /**********************************************************/
GetAccArray(Parser::ESource source)2525 const char **GetAccArray(Parser::ESource source)
2526 {
2527 if(source == Parser::ESource::EMBL)
2528 return(embl_accpref);
2529 if(source == Parser::ESource::PIR)
2530 return(pir_accpref);
2531 if(source == Parser::ESource::PRF)
2532 return(prf_accpref);
2533 if(source == Parser::ESource::SPROT)
2534 return(sprot_accpref);
2535 if(source == Parser::ESource::LANL)
2536 return(lanl_accpref);
2537 if(source == Parser::ESource::DDBJ)
2538 return(ddbj_accpref);
2539 if(source == Parser::ESource::NCBI)
2540 return(ncbi_accpref);
2541 if(source == Parser::ESource::Refseq)
2542 return(refseq_accpref);
2543 return(NULL);
2544 }
2545
2546 /**********************************************************/
GetNucAccOwner(const char * acc,bool is_tpa)2547 CSeq_id::E_Choice GetNucAccOwner(const char* acc, bool is_tpa)
2548 {
2549 Char p[4];
2550 const char*q;
2551
2552 if(acc == NULL)
2553 return objects::CSeq_id::e_not_set;
2554
2555 size_t len = StringLen(acc);
2556
2557 if(len > 8 && acc[2] == '_')
2558 {
2559 p[0] = acc[0];
2560 p[1] = acc[1];
2561 p[2] = acc[2];
2562 p[3] = '\0';
2563 if(MatchArrayString(refseq_accpref, p) > -1)
2564 {
2565 for(q = acc + 3; *q != '\0'; q++)
2566 {
2567 if(*q >= '0' && *q <= '9')
2568 continue;
2569 break;
2570 }
2571 if(*q == '\0')
2572 {
2573 return(objects::CSeq_id::e_Other);
2574 }
2575 }
2576 }
2577
2578 if(len != 6 && (len < 8 || len > 17))
2579 return objects::CSeq_id::e_not_set;
2580
2581 if(len == 11)
2582 {
2583 if(acc[0] == 'N' && acc[1] == 'Z' && acc[2] == '_' &&
2584 acc[3] >= 'A' && acc[3] <= 'Z' && acc[4] >= 'A' && acc[4] <= 'Z')
2585 {
2586 for(q = acc + 5; *q != '\0'; q++)
2587 if(*q < '0' || *q > '9')
2588 break;
2589 if(*q == '\0')
2590 {
2591 return objects::CSeq_id::e_Other;
2592 }
2593 }
2594 return objects::CSeq_id::e_not_set;
2595 }
2596
2597 if(len == 6)
2598 {
2599 if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < '0' || acc[1] > '9' ||
2600 acc[2] < '0' || acc[2] > '9' || acc[3] < '0' || acc[3] > '9' ||
2601 acc[4] < '0' || acc[4] > '9' || acc[5] < '0' || acc[5] > '9')
2602 return objects::CSeq_id::e_not_set;
2603
2604 if(StringChr(ParFlat_NCBI_AC, acc[0]) != NULL)
2605 {
2606 return objects::CSeq_id::e_Genbank;
2607 }
2608 if(StringChr(ParFlat_LANL_AC, acc[0]) != NULL)
2609 {
2610 return objects::CSeq_id::e_Genbank;
2611 }
2612 if(StringChr(ParFlat_DDBJ_AC, acc[0]) != NULL)
2613 {
2614 return objects::CSeq_id::e_Ddbj;
2615 }
2616 if(StringChr(ParFlat_EMBL_AC, acc[0]) != NULL)
2617 {
2618 if (!is_tpa)
2619 return objects::CSeq_id::e_Embl;
2620 return objects::CSeq_id::e_Tpe;
2621 }
2622 return objects::CSeq_id::e_not_set;
2623 }
2624
2625 if(len > 11 && len < 16 && acc[2] != '_')
2626 {
2627 if(len == 12 && acc[0] == 'A' && acc[1] >= 'A' && acc[1] <= 'Z' &&
2628 acc[2] >= 'A' && acc[2] <= 'Z' && acc[3] >= 'A' &&
2629 acc[3] <= 'Z' && acc[4] >= 'A' && acc[4] <= 'Z' &&
2630 StringCmp(acc + 5, "0000000") == 0)
2631 {
2632 return(objects::CSeq_id::e_Ddbj);
2633 }
2634
2635 if(((acc[0] < 'A' || acc[0] > 'S') &&
2636 (acc[0] < 'T' || acc[0] > 'W')) ||
2637 acc[1] < 'A' || acc[1] > 'Z' || acc[2] < 'A' || acc[2] > 'Z' ||
2638 acc[3] < 'A' || acc[3] > 'Z' || acc[4] < '0' || acc[4] > '9' ||
2639 acc[5] < '0' || acc[5] > '9' ||
2640 ((acc[6] < '0' || acc[6] > '9') && acc[6] != 'S') ||
2641 acc[7] < '0' || acc[7] > '9' ||
2642 acc[8] < '0' || acc[8] > '9' || acc[9] < '0' || acc[9] > '9' ||
2643 acc[10] < '0' || acc[10] > '9' || acc[11] < '0' || acc[11] > '9')
2644 {
2645 if(len != 15)
2646 return objects::CSeq_id::e_not_set;
2647 }
2648
2649 if(len == 12 && acc[6] == 'S')
2650 return objects::CSeq_id::e_not_set;
2651 if(len == 15 && acc[6] != 'S' && acc[5] >= '0' && acc[5] <= '9')
2652 return objects::CSeq_id::e_not_set;
2653
2654 if(len > 12 && (acc[12] < '0' || acc[12] > '9'))
2655 return objects::CSeq_id::e_not_set;
2656 if (len > 13 && (acc[13] < '0' || acc[13] > '9'))
2657 return objects::CSeq_id::e_not_set;
2658 if (len > 14 && (acc[14] < '0' || acc[14] > '9'))
2659 return objects::CSeq_id::e_not_set;
2660
2661 if(acc[0] == 'A' || acc[0] == 'D' || acc[0] == 'G' ||
2662 (acc[0] > 'I' && acc[0] < 'O') || (acc[0] > 'O' && acc[0] < 'T') ||
2663 acc[0] == 'V' || acc[0] == 'W')
2664 {
2665 if(acc[0] == 'D')
2666 return objects::CSeq_id::e_Tpg;
2667 return objects::CSeq_id::e_Genbank;
2668 }
2669 if(acc[0] == 'B' || acc[0] == 'E' || acc[0] == 'I' || acc[0] == 'T')
2670 {
2671 if(acc[0] == 'E')
2672 return objects::CSeq_id::e_Tpd;
2673 return objects::CSeq_id::e_Ddbj;
2674 }
2675 if(acc[0] == 'C' || acc[0] == 'F' || acc[0] == 'O' || acc[0] == 'H' ||
2676 acc[0] == 'U')
2677 {
2678 if (!is_tpa)
2679 return objects::CSeq_id::e_Embl;
2680 return objects::CSeq_id::e_Tpe;
2681 }
2682 if(len != 15)
2683 return objects::CSeq_id::e_not_set;
2684 }
2685
2686 if(len > 14 && len < 18)
2687 {
2688 if(acc[2] == '_')
2689 {
2690 if(acc[0] != 'N' || acc[1] != 'Z' || acc[2] != '_' ||
2691 ((acc[3] < 'A' || acc[3] > 'J') &&
2692 (acc[3] < 'L' || acc[3] > 'N')) ||
2693 acc[4] < 'A' || acc[4] > 'Z' || acc[5] < 'A' || acc[5] > 'Z' ||
2694 acc[6] < 'A' || acc[6] > 'Z' || acc[7] < '0' || acc[7] > '9' ||
2695 acc[8] < '0' || acc[8] > '9' || acc[9] < '0' || acc[9] > '9' ||
2696 acc[10] < '0' || acc[10] > '9' || acc[11] < '0' ||
2697 acc[11] > '9' || acc[12] < '0' || acc[12] > '9' ||
2698 acc[13] < '0' || acc[13] > '9' || acc[14] < '0' || acc[14] > '9')
2699 return objects::CSeq_id::e_not_set;
2700
2701 if(len > 15 && (acc[15] < '0' || acc[15] > '9'))
2702 return objects::CSeq_id::e_not_set;
2703 if(len > 16 && (acc[16] < '0' || acc[16] > '9'))
2704 return objects::CSeq_id::e_not_set;
2705 return objects::CSeq_id::e_Other;
2706 }
2707 if((acc[0] != 'A' && acc[0] != 'B' && acc[0] != 'C') ||
2708 acc[1] < 'A' || acc[1] > 'Z' || acc[2] < 'A' || acc[2] > 'Z' ||
2709 acc[3] < 'A' || acc[3] > 'Z' || acc[4] < 'A' || acc[4] > 'Z' ||
2710 acc[5] < 'A' || acc[5] > 'Z' || acc[6] < '0' || acc[6] > '9' ||
2711 acc[7] < '0' || acc[7] > '9' || acc[8] < '0' || acc[8] > '9' ||
2712 acc[9] < '0' || acc[9] > '9' || acc[10] < '0' || acc[10] > '9' ||
2713 acc[11] < '0' || acc[11] > '9' || acc[12] < '0' || acc[12] > '9' ||
2714 acc[13] < '0' || acc[13] > '9' || acc[14] < '0' || acc[14] > '9')
2715 return objects::CSeq_id::e_not_set;
2716
2717 if(len > 15 && (acc[15] < '0' || acc[15] > '9'))
2718 return objects::CSeq_id::e_not_set;
2719 if(len > 16 && (acc[16] < '0' || acc[16] > '9'))
2720 return objects::CSeq_id::e_not_set;
2721
2722 if(acc[0] == 'A')
2723 {
2724 return objects::CSeq_id::e_Genbank;
2725 }
2726 if(acc[0] == 'B')
2727 {
2728 return objects::CSeq_id::e_Ddbj;
2729 }
2730 if(acc[0] == 'C')
2731 {
2732 return objects::CSeq_id::e_Embl;
2733 }
2734 return objects::CSeq_id::e_not_set;
2735 }
2736
2737 q = acc + ((len == 8 || len == 10) ? 2 : 3);
2738 if(q[0] < '0' || q[0] > '9' || q[1] < '0' || q[1] > '9' ||
2739 q[2] < '0' || q[2] > '9' || q[3] < '0' || q[3] > '9' ||
2740 q[4] < '0' || q[4] > '9' || q[5] < '0' || q[5] > '9')
2741 return objects::CSeq_id::e_not_set;
2742
2743 if(len == 9)
2744 {
2745 p[0] = acc[0];
2746 p[1] = acc[1];
2747 p[2] = acc[2];
2748 p[3] = '\0';
2749 if(MatchArrayString(refseq_accpref, p) > -1)
2750 {
2751 return objects::CSeq_id::e_Other;
2752 }
2753 return objects::CSeq_id::e_not_set;
2754 }
2755
2756 if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < 'A' || acc[1] > 'Z')
2757 return objects::CSeq_id::e_not_set;
2758
2759 p[0] = acc[0];
2760 p[1] = acc[1];
2761 p[2] = '\0';
2762 if(MatchArrayString(ncbi_accpref, p) > -1)
2763 {
2764 if(MatchArrayString(ncbi_tpa_accpref, p) > -1)
2765 return objects::CSeq_id::e_Tpg;
2766 return objects::CSeq_id::e_Genbank;
2767 }
2768 if(MatchArrayString(lanl_accpref, p) > -1)
2769 {
2770 return objects::CSeq_id::e_Genbank;
2771 }
2772 if(MatchArrayString(ddbj_accpref, p) > -1)
2773 {
2774 if(MatchArrayString(ddbj_tpa_accpref, p) > -1)
2775 return objects::CSeq_id::e_Tpd;
2776 return objects::CSeq_id::e_Ddbj;
2777 }
2778 if(MatchArrayString(embl_accpref, p) > -1)
2779 {
2780 if (!is_tpa)
2781 return objects::CSeq_id::e_Embl;
2782 return objects::CSeq_id::e_Tpe;
2783 }
2784
2785 return objects::CSeq_id::e_not_set;
2786 }
2787
2788
2789
2790 /**********************************************************/
GetProtAccOwner(const Char * acc)2791 Uint1 GetProtAccOwner(const Char* acc)
2792 {
2793 const Char* q;
2794 Char p[4];
2795
2796 if(acc == NULL)
2797 return(0);
2798
2799 size_t len = StringLen(acc);
2800 if(len == 9 || len == 12)
2801 {
2802 p[0] = acc[0];
2803 p[1] = acc[1];
2804 p[2] = acc[2];
2805 p[3] = '\0';
2806 if(MatchArrayString(refseq_prot_accpref, p) > -1)
2807 {
2808 for(q = &acc[3]; *q >= '0' && *q <= '9';)
2809 q++;
2810 if(*q == '\0')
2811 return objects::CSeq_id::e_Other;
2812 }
2813 return(0);
2814 }
2815
2816 if(len != 8 && len != 10)
2817 return(0);
2818
2819 if(acc[0] < 'A' || acc[0] > 'Z' || acc[1] < 'A' || acc[1] > 'Z' ||
2820 acc[2] < 'A' || acc[2] > 'Z' || acc[3] < '0' || acc[3] > '9' ||
2821 acc[4] < '0' || acc[4] > '9' || acc[5] < '0' || acc[5] > '9' ||
2822 acc[6] < '0' || acc[6] > '9' || acc[7] < '0' || acc[7] > '9')
2823 {
2824 if(len == 8)
2825 return(0);
2826 if(acc[8] < '0' || acc[8] > '9' || acc[9] < '0' || acc[9] > '9')
2827 return(0);
2828 }
2829
2830 if(acc[0] == 'D' || acc[0] == 'H')
2831 return objects::CSeq_id::e_Tpg;
2832 if(acc[0] == 'F' || acc[0] == 'I')
2833 return objects::CSeq_id::e_Tpd;
2834 if(acc[0] == 'A' || acc[0] == 'E' || acc[0] == 'J' || acc[0] == 'K' ||
2835 (acc[0] > 'L' && acc[0] < 'S') || acc[0] == 'T' || acc[0] == 'U')
2836 return objects::CSeq_id::e_Genbank;
2837 if(acc[0] == 'B' || acc[0] == 'G' || acc[0] == 'L')
2838 return objects::CSeq_id::e_Ddbj;
2839 if(acc[0] == 'C' || acc[0] == 'S' || acc[0] == 'V')
2840 return objects::CSeq_id::e_Embl;
2841
2842 return(0);
2843 }
2844
2845 END_NCBI_SCOPE
2846