1 /* gb_index.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: gb_index.c
28 *
29 * Author: Karl Sirotkin, Hsiu-Chuan Chen
30 *
31 * File Description:
32 * -----------------
33 * Parsing genbank to memory blocks. Build Genbank format index block.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39
40 #include "index.h"
41 #include "genbank.h"
42
43 #include "ftaerr.hpp"
44 #include "indx_blk.h"
45 #include "indx_def.h"
46 #include "utilfun.h"
47 #include "entry.h"
48
49 #ifdef THIS_FILE
50 # undef THIS_FILE
51 #endif
52 #define THIS_FILE "gb_index.cpp"
53
54 BEGIN_NCBI_SCOPE
55
56 KwordBlk genbankKeywordLength[] = {
57 {"LOCUS", 5}, {"DEFINITION", 10}, {"ACCESSION", 9}, {"NID", 3},
58 {"GSDB ID", 7}, {"KEYWORDS", 8}, {"SEGMENT", 7}, {"SOURCE", 6},
59 {"REFERENCE", 9}, {"COMMENT", 7}, {"FEATURES", 8}, {"BASE COUNT", 10},
60 {"ORIGIN", 6}, {"//", 2}, {"GSDBID", 6}, {"CONTIG", 6},
61 {"VERSION", 7}, {"USER", 4}, {"WGS", 3}, {"PRIMARY", 7},
62 {"MGA", 3}, {"PROJECT", 7}, {"DBLINK", 6}, {NULL, 0}
63 };
64
65 // LCOV_EXCL_START
66 // Excluded per Mark's request on 12/14/2016
67 /**********************************************************
68 *
69 * static bool DelSegnum(str, segnum, len2):
70 *
71 * Strip off segnum which has number of "len1" digits,
72 * then check if any tailing zero existed.
73 * Subroutine return:
74 * TRUE if
75 * - there is no tailing zero or
76 * - the number of the tailing zero is equal or greater
77 * than (len2-len1) (i.e. strip off len2-len1 of "0").
78 * FALSE and no change in the string "str" if
79 * - len2-len1 less than zero or
80 * - there is not enough "len1" digits at end of
81 * the string "str" or
82 * - there is not enough len2-len1 zero at end of
83 * the string "str".
84 *
85 * February 25 1993
86 *
87 **********************************************************/
DelSegnum(IndexblkPtr entry,char * segnum,size_t len2)88 static bool DelSegnum(IndexblkPtr entry, char* segnum, size_t len2)
89 {
90 char* str;
91 char* p;
92 char* q;
93
94 if(segnum == NULL)
95 return false;
96 size_t len1 = StringLen(segnum);
97 if(len2 < len1)
98 return false;
99
100 /* check, is there enough digits to delete
101 */
102 size_t tlen = len1;
103 str = entry->blocusname;
104 size_t i = StringLen(str) - 1;
105 for(; tlen > 0 && str[i] >= '0' && str[i] <= '9'; i--)
106 tlen--;
107
108 if(tlen != 0 || i < 0)
109 return false;
110
111 if(len2 > len1 && str[i] == '0')
112 {
113 /* check, is there enough "0" appended
114 */
115 for(tlen = len2 - len1; tlen > 0 && str[i] == '0'; i--)
116 tlen--;
117
118 if(tlen != 0)
119 return false;
120 }
121
122 for(q = &str[i+1], p = q; *p == '0';)
123 p++;
124
125 i = atoi(segnum);
126 if((size_t) atoi(p) != i)
127 {
128 ErrPostEx(SEV_REJECT, ERR_SEGMENT_BadLocusName,
129 "Segment suffix in locus name \"%s\" does not match number in SEGMENT line = \"%d\". Entry dropped.",
130 str, i);
131 entry->drop = 1;
132 }
133
134 *q = '\0'; /* strip off "len" characters */
135 return true;
136 }
137
138 /**********************************************************/
GetSegment(char * str,IndexblkPtr entry)139 static void GetSegment(char* str, IndexblkPtr entry)
140 {
141 TokenStatBlkPtr stoken;
142 TokenBlkPtr ptr2;
143 TokenBlkPtr ptr4;
144
145 stoken = TokenString(str, ' ');
146
147 if(stoken->num > 3)
148 {
149 ptr2 = stoken->list->next;
150 ptr4 = ptr2->next->next;
151 entry->segnum = (Uint2) atoi(ptr2->str);
152
153 if(!DelSegnum(entry, ptr2->str, StringLen(ptr4->str)))
154 {
155 ErrPostEx(SEV_ERROR, ERR_SEGMENT_BadLocusName,
156 "Bad locus name %s in %d",
157 entry->blocusname, entry->linenum);
158 }
159
160 entry->segtotal = (Uint2) atoi(ptr4->str);
161 }
162 else
163 {
164 ErrPostEx(SEV_ERROR, ERR_SEGMENT_IncompSeg,
165 "Incomplete Segment information at linenum %d",
166 entry->linenum);
167 }
168
169 FreeTokenstatblk(stoken);
170 }
171 // LCOV_EXCL_STOP
172
173 /**********************************************************/
gb_err_field(char * str)174 static Uint1 gb_err_field(char* str)
175 {
176 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
177 "No %s data in GenBank format file, entry dropped", str);
178 return(1);
179 }
180
181 /**********************************************************/
ParseGenBankVersion(IndexblkPtr entry,char * line,char * nid,Parser::ESource source,Parser::EMode mode,bool ign_toks)182 static void ParseGenBankVersion(IndexblkPtr entry, char* line, char* nid,
183 Parser::ESource source,
184 Parser::EMode mode,
185 bool ign_toks)
186 {
187 bool gi;
188 char* p;
189 char* q;
190 char* r;
191 Char ch;
192 Char ch1;
193
194 if(line == NULL)
195 return;
196
197 for(p = line; *p != '\0' && *p != ' ' && *p != '\t';)
198 p++;
199 gi = (*p == '\0') ? false : true;
200
201 ch1 = *p;
202 *p = '\0';
203 q = StringRChr(line, '.');
204 if(q == NULL)
205 {
206 if (mode != Parser::EMode::Relaxed) {
207 *p = ch1;
208 ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum,
209 "Missing VERSION number in VERSION line: \"%s\".", line);
210 entry->drop = 1;
211 }
212 return;
213 }
214
215 for(r = q + 1; *r >= '0' && *r <= '9';)
216 r++;
217 if(*r != '\0')
218 {
219 if (mode != Parser::EMode::Relaxed) {
220 *p = ch1;
221 ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum,
222 "Incorrect VERSION number in VERSION line: \"%s\".", line);
223 entry->drop = 1;
224 }
225 return;
226 }
227 ch = *q;
228 *q = '\0';
229 if(entry->acnum == NULL || StringCmp(entry->acnum, line) != 0)
230 {
231 *q = ch;
232 *p = ch1;
233 if (mode != Parser::EMode::Relaxed) {
234 ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch,
235 "Accessions in VERSION and ACCESSION lines don't match: \"%s\" vs \"%s\".",
236 line, (entry->acnum == NULL) ? "NULL" : entry->acnum);
237 entry->drop = 1;
238 }
239 return;
240 }
241 entry->vernum = atoi(q + 1);
242 *q = ch;
243
244 if(entry->vernum < 1)
245 {
246 *p = ch1;
247 ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion,
248 "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer.",
249 entry->vernum, entry->acnum, entry->vernum);
250 entry->drop = 1;
251 return;
252 }
253
254 if(ch1 != '\0')
255 for(*p++ = ch1; *p == ' ' || *p == '\t';)
256 p++;
257
258 if(source == Parser::ESource::DDBJ)
259 {
260 if(*p != '\0' && !ign_toks)
261 {
262 ErrPostEx(SEV_ERROR, ERR_VERSION_BadVersionLine,
263 "DDBJ's VERSION line has too many tokens: \"%s\".", line);
264 }
265 return;
266 }
267
268 if(!gi)
269 return;
270
271 if(StringNCmp(p, "GI:", 3) != 0)
272 {
273 ErrPostEx(SEV_FATAL, ERR_VERSION_IncorrectGIInVersion,
274 "Incorrect GI entry in VERSION line: \"%s\".", line);
275 entry->drop = 1;
276 return;
277 }
278 p += 3;
279 for(q = p; *q >= '0' && *q <= '9';)
280 q++;
281 if(*q != '\0')
282 {
283 ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitGI,
284 "Incorrect GI number in VERSION line: \"%s\".", line);
285 entry->drop = 1;
286 }
287 }
288
289 /**********************************************************/
fta_check_mga_line(char * line,IndexblkPtr ibp)290 static bool fta_check_mga_line(char* line, IndexblkPtr ibp)
291 {
292 char* p;
293 char* q;
294 char* str;
295 Int4 from;
296 Int4 to;
297
298 if(line == NULL || ibp == NULL)
299 return false;
300
301 for(p = line; *p == ' ' || *p == '\t';)
302 p++;
303 str = StringSave(p);
304 p = StringChr(str, '\n');
305 if(p != NULL)
306 *p = '\0';
307 p = StringChr(str, '-');
308 if(p == NULL)
309 {
310 MemFree(str);
311 return false;
312 }
313 *p++ = '\0';
314
315 if(StringLen(str) != 12 || StringLen(p) != 12 ||
316 StringNCmp(str, ibp->acnum, 5) != 0 ||
317 StringNCmp(p, ibp->acnum, 5) != 0)
318 {
319 MemFree(str);
320 return false;
321 }
322
323 for(q = str + 5; *q >= '0' && *q <= '9';)
324 q++;
325 if(*q != '\0')
326 {
327 MemFree(str);
328 return false;
329 }
330 for(q = p + 5; *q >= '0' && *q <= '9';)
331 q++;
332 if(*q != '\0')
333 {
334 MemFree(str);
335 return false;
336 }
337
338 for(q = str + 5; *q == '0';)
339 q++;
340 from = atoi(q);
341 for(q = p + 5; *q == '0';)
342 q++;
343 to = atoi(q);
344
345 if(from > to)
346 {
347 MemFree(str);
348 return false;
349 }
350
351 ibp->bases = to - from + 1;
352 MemFree(str);
353 return true;
354 }
355
356
357
358 /**********************************************************/
GenBankIndex(ParserPtr pp)359 bool GenBankIndex(ParserPtr pp)
360 {
361 FinfoBlkPtr finfo;
362
363 bool acwflag;
364 bool end_of_file;
365 bool after_LOCUS;
366 bool after_DEFNTN;
367 bool after_SOURCE;
368 bool after_REFER;
369 bool after_FEAT;
370 bool after_ORIGIN;
371 bool after_COMMENT;
372 bool after_VERSION;
373 bool after_MGA;
374
375 IndexblkPtr entry;
376 Int2 currentKeyword;
377 Int4 indx = 0;
378 DataBlkPtr data = NULL;
379 IndBlkNextPtr ibnp;
380 IndBlkNextPtr tibnp;
381 char* p;
382 char* q;
383 char* line_ver;
384 char* line_nid;
385 char* line_locus;
386 size_t i;
387 ValNodePtr kwds;
388 ValNodePtr tkwds;
389 ValNodePtr dbl;
390 ValNodePtr tdbl;
391
392 finfo = (FinfoBlkPtr) MemNew(sizeof(FinfoBlk));
393
394 end_of_file = SkipTitleBuf(pp->ffbuf, finfo, "LOCUS");
395
396 if(end_of_file)
397 {
398 MsgSkipTitleFail((char*) "GenBank", finfo);
399 return false;
400 }
401
402 bool tpa_check = (pp->source == Parser::ESource::EMBL);
403
404 ibnp = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
405 ibnp->next = NULL;
406 tibnp = ibnp;
407
408 pp->num_drop = 0;
409 kwds = NULL;
410 dbl = NULL;
411 while (!end_of_file)
412 {
413 entry = InitialEntry(pp, finfo);
414 if(entry != NULL)
415 {
416 pp->curindx = indx;
417 tibnp->next = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
418 tibnp = tibnp->next;
419 tibnp->ibp = entry;
420 tibnp->next = NULL;
421
422 indx++;
423
424 entry->is_contig = false;
425 entry->origin = false;
426 entry->is_mga = false;
427 acwflag = false;
428 after_LOCUS = false;
429 after_DEFNTN = false;
430 after_SOURCE = false;
431 after_REFER = false;
432 after_FEAT = false;
433 after_ORIGIN = false;
434 after_COMMENT = false;
435 after_VERSION = false;
436 after_MGA = false;
437
438 currentKeyword = ParFlat_LOCUS;
439 line_ver = NULL;
440 line_nid = NULL;
441 line_locus = NULL;
442 if(kwds != NULL)
443 kwds = ValNodeFreeData(kwds);
444 tkwds = NULL;
445 size_t kwds_len = 0;
446 if(dbl != NULL)
447 dbl = ValNodeFreeData(dbl);
448 tdbl = NULL;
449 size_t dbl_len = 0;
450 while(currentKeyword != ParFlat_END && !end_of_file)
451 {
452 switch(currentKeyword)
453 {
454 case ParFlat_LOCUS:
455 if(after_LOCUS)
456 {
457 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
458 "More than two lines LOCUS in one entry");
459 entry->drop = 1;
460 }
461 else
462 {
463 after_LOCUS = true;
464 line_locus = StringSave(finfo->str);
465 }
466 break;
467 case ParFlat_COMMENT:
468 if(after_COMMENT)
469 {
470 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
471 "Multiple COMMENT lines in one entry");
472 entry->drop = 1;
473 }
474 else
475 after_COMMENT = true;
476
477 break;
478 case ParFlat_VERSION:
479 p = StringStr(finfo->str + ParFlat_COL_DATA, "GI:");
480 if(p != NULL && atol(p + 3) > 0)
481 entry->wgs_and_gi |= 01;
482 if(pp->accver == false)
483 break;
484 if(after_VERSION)
485 {
486 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
487 "Multiple VERSION lines in one entry");
488 entry->drop = 1;
489 break;
490 }
491 after_VERSION = true;
492 p = finfo->str + ParFlat_COL_DATA;
493 while(*p == ' ' || *p == '\t')
494 p++;
495 for(q = p; *q != '\0' && *q != '\n';)
496 q++;
497 while(q > p)
498 {
499 q--;
500 if(*q != ' ' && *q != '\t')
501 {
502 q++;
503 break;
504 }
505 }
506 i = q - p;
507 line_ver = (char*) MemNew(i + 1);
508 StringNCpy(line_ver, p, i);
509 line_ver[i] = '\0';
510 break;
511 case ParFlat_NCBI_GI:
512 if(pp->source == Parser::ESource::DDBJ || pp->accver == false ||
513 line_nid != NULL)
514 break;
515 p = finfo->str + ParFlat_COL_DATA;
516 while(*p == ' ' || *p == '\t')
517 p++;
518 for(q = p; *q != '\0' && *q != ' ' && *q != '\t' &&
519 *q != '\n';)
520 q++;
521 i = q - p;
522 line_nid = (char*) MemNew(i + 1);
523 StringNCpy(line_nid, p, i);
524 line_nid[i] = '\0';
525 break;
526 case ParFlat_DEFINITION:
527 if(after_DEFNTN)
528 {
529 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
530 "More than two lines 'DEFINITION'");
531 entry->drop = 1;
532 }
533 else if(after_LOCUS == false)
534 {
535 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
536 "DEFINITION field out of order");
537 entry->drop = 1;
538 }
539 else
540 after_DEFNTN = true;
541
542 break;
543 case ParFlat_SOURCE:
544 if(after_SOURCE)
545 {
546 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
547 "More than two lines 'SOURCE'");
548 entry->drop = 1;
549 }
550 else if(after_LOCUS == false || after_DEFNTN == false)
551 {
552 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
553 "SOURCE field out of order");
554 entry->drop = 1;
555 }
556 else
557 after_SOURCE = true;
558
559 break;
560 case ParFlat_REFERENCE:
561 after_REFER = true;
562 break;
563 case ParFlat_CONTIG:
564 if(entry->is_contig)
565 {
566 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
567 "More than one line CONTIG in one entry");
568 entry->drop = 1;
569 }
570 else
571 entry->is_contig = true;
572 break;
573 case ParFlat_MGA:
574 if(entry->is_mga == false)
575 {
576 ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
577 "Line type \"MGA\" is allowed for CAGE records only. Entry dropped.");
578 entry->drop = 1;
579 }
580 if(fta_check_mga_line(finfo->str + ParFlat_COL_DATA, entry) == false)
581 {
582 ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectMGALine,
583 "Incorrect range of accessions supplied in MGA line of CAGE record. Entry dropped.");
584 entry->drop = 1;
585 }
586 after_MGA = true;
587 break;
588 case ParFlat_FEATURES:
589 if(after_FEAT)
590 {
591 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
592 "More than two lines 'FEATURES'");
593 entry->drop = 1;
594 }
595 else if(pp->mode != Parser::EMode::Relaxed &&
596 (after_LOCUS == false ||
597 after_DEFNTN == false ||
598 after_SOURCE == false))
599 {
600 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
601 "FEATURES field out of order");
602 entry->drop = 1;
603 }
604 else
605 after_FEAT = true;
606
607 break;
608 case ParFlat_ORIGIN:
609 if(after_ORIGIN)
610 {
611 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
612 "More than two lines 'ORIGIN'");
613 entry->drop = 1;
614 }
615 else if(
616 pp->mode != Parser::EMode::Relaxed &&
617 (after_LOCUS == false ||
618 after_DEFNTN == false ||
619 after_SOURCE == false ||
620 after_FEAT == false))
621 {
622 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
623 "ORIGIN field out of order");
624 entry->drop = 1;
625 }
626 else
627 {
628 after_ORIGIN = true;
629 entry->origin = true;
630 }
631 break;
632 case ParFlat_ACCESSION:
633 if(acwflag == false) /* first accession line */
634 {
635 acwflag = true;
636 if (!GetAccession(pp, finfo->str, entry, 2)) {
637 if (pp->mode != Parser::EMode::Relaxed) {
638 pp->num_drop++;
639 }
640 }
641 }
642 break;
643 case ParFlat_SEGMENT:
644 // LCOV_EXCL_START
645 // Excluded per Mark's request on 12/14/2016
646 GetSegment(finfo->str, entry);
647 // LCOV_EXCL_STOP
648 break;
649 case ParFlat_USER:
650 if(pp->source != Parser::ESource::Flybase)
651 {
652 ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
653 "Line type \"USER\" is allowed for source \"FLYBASE\" only. Entry dropped.");
654 entry->drop = 1;
655 }
656 break;
657 case ParFlat_PRIMARY:
658 if(entry->is_tpa == false &&
659 entry->tsa_allowed == false &&
660 pp->source != Parser::ESource::Refseq)
661 {
662 ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
663 "Line type \"PRIMARY\" is allowed for TPA or TSA records only. Continue anyway.");
664 }
665 break;
666 case ParFlat_KEYWORDS:
667 if(pp->source != Parser::ESource::DDBJ &&
668 pp->source != Parser::ESource::EMBL)
669 break;
670 if(kwds != NULL)
671 kwds = ValNodeFreeData(kwds);
672 kwds = ConstructValNode(NULL, 0,
673 StringSave(finfo->str + 8));
674 tkwds = kwds;
675 kwds_len = StringLen(finfo->str) - 8;
676 break;
677 case ParFlat_DBLINK:
678 if(dbl != NULL)
679 dbl = ValNodeFreeData(dbl);
680 dbl = ConstructValNode(NULL, 0,
681 StringSave(finfo->str + 8));
682 tdbl = dbl;
683 dbl_len = StringLen(finfo->str) - 8;
684 break;
685 default:
686 break;
687 } /* switch */
688
689 end_of_file = XReadFileBuf(pp->ffbuf, finfo);
690
691 while (!end_of_file && (finfo->str[0] == ' ' || finfo->str[0] == '\t'))
692 {
693 if(currentKeyword == ParFlat_KEYWORDS && tkwds != NULL)
694 {
695 tkwds->next = ValNodeNew(NULL);
696 tkwds = tkwds->next;
697 tkwds->data.ptrvalue = StringSave(finfo->str);
698 kwds_len += StringLen(finfo->str);
699 }
700
701 if(currentKeyword == ParFlat_DBLINK && tdbl != NULL)
702 {
703 tdbl->next = ValNodeNew(NULL);
704 tdbl = tdbl->next;
705 tdbl->data.ptrvalue = StringSave(finfo->str);
706 dbl_len += StringLen(finfo->str);
707 }
708
709 if(currentKeyword == ParFlat_ACCESSION && entry->drop == 0 &&
710 GetAccession(pp, finfo->str, entry, 0) == false)
711 pp->num_drop++;
712
713 end_of_file = XReadFileBuf(pp->ffbuf, finfo);
714 }
715
716
717
718 if(kwds != NULL)
719 {
720 check_est_sts_gss_tpa_kwds(kwds, kwds_len, entry,
721 tpa_check, entry->specialist_db,
722 entry->inferential,
723 entry->experimental,
724 entry->assembly);
725 kwds = ValNodeFreeData(kwds);
726 kwds_len = 0;
727 }
728
729 if (pp->mode == Parser::EMode::Relaxed &&
730 NStr::IsBlank(finfo->str)) {
731 currentKeyword = ParFlat_UNKW;
732 continue;
733 }
734
735 currentKeyword = SrchKeyword(finfo->str, genbankKeywordLength);
736
737 if(finfo->str[0] != ' ' && finfo->str[0] != '\t' &&
738 CheckLineType(finfo->str, finfo->line, genbankKeywordLength, after_ORIGIN) == false)
739 entry->drop = 1;
740
741 } /* while, end of one entry */
742
743 entry->is_tpa_wgs_con = (entry->is_contig && entry->is_wgs && entry->is_tpa);
744
745 if(entry->drop != 1)
746 {
747
748 if (pp->mode != Parser::EMode::Relaxed) {
749 if(line_locus != NULL &&
750 CkLocusLinePos(line_locus, pp->source, &entry->lc, entry->is_mga) == false)
751 entry->drop = 1;
752
753 if(entry->is_mga && after_MGA == false)
754 entry->drop = gb_err_field((char*) "MGA");
755
756 if(after_LOCUS == false)
757 entry->drop = gb_err_field((char*) "LOCUS");
758
759 if(after_VERSION == false && pp->accver)
760 entry->drop = gb_err_field((char*) "VERSION");
761
762 if(after_DEFNTN == false)
763 entry->drop = gb_err_field((char*) "DEFINITION");
764
765 if(after_SOURCE == false)
766 entry->drop = gb_err_field((char*) "SOURCE");
767
768 if(after_REFER == false && pp->source != Parser::ESource::Flybase &&
769 entry->is_wgs == false &&
770 (pp->source != Parser::ESource::Refseq ||
771 StringNCmp(entry->acnum, "NW_", 3) != 0)) {
772 entry->drop = gb_err_field((char*) "REFERENCE");
773 }
774
775 if(after_FEAT == false) {
776 entry->drop = gb_err_field((char*) "FEATURES");
777 }
778 } // !Parser::EMode::Relaxed
779
780 if(entry->is_contig && entry->segnum != 0)
781 {
782 ErrPostEx(SEV_ERROR, ERR_FORMAT_ContigInSegset,
783 "CONTIG data are not allowed for members of segmented sets, entry dropped.");
784 entry->drop = 1;
785 }
786 }
787 if(pp->accver)
788 {
789 if(pp->mode == Parser::EMode::HTGSCON)
790 entry->vernum = 1;
791 else
792 ParseGenBankVersion(
793 entry,
794 line_ver,
795 line_nid,
796 pp->source,
797 pp->mode,
798 pp->ign_toks);
799 }
800 if(line_locus != NULL)
801 {
802 MemFree(line_locus);
803 line_locus = NULL;
804 }
805 if(line_ver != NULL)
806 {
807 MemFree(line_ver);
808 line_ver = NULL;
809 }
810 if(line_nid != NULL)
811 {
812 MemFree(line_nid);
813 line_nid = NULL;
814 }
815 entry->len = (size_t) (pp->ffbuf.current - pp->ffbuf.start) -
816 entry->offset;
817
818 if(acwflag == false &&
819 pp->mode != Parser::EMode::Relaxed)
820 {
821 ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
822 "No accession # for this entry, about line %ld",
823 (long int) entry->linenum);
824 }
825
826 if(dbl != NULL)
827 {
828 dbl = ValNodeFreeData(dbl);
829 dbl_len = 0;
830 }
831 } /* if, entry */
832 else
833 {
834 end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "//");
835 }
836
837 end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "LOCUS");
838
839 } /* while, end_of_file */
840
841 pp->indx = indx;
842
843 FtaDeletePrefix(PREFIX_LOCUS | PREFIX_ACCESSION);
844
845 if(pp->qsfd != NULL && QSIndex(pp, ibnp->next) == false)
846 return false;
847
848 pp->entrylist = (IndexblkPtr*) MemNew(indx * sizeof(IndexblkPtr));
849 tibnp = ibnp->next;
850 MemFree(ibnp);
851 for(int j = 0; j < indx && tibnp != NULL; j++, tibnp = ibnp)
852 {
853 pp->entrylist[j] = tibnp->ibp;
854 ibnp = tibnp->next;
855 MemFree(tibnp);
856 }
857
858 MemFree(finfo);
859
860 return(end_of_file);
861 }
862
863 END_NCBI_SCOPE
864