/* gb_index.cpp
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
 * File Name:  gb_index.cpp
28  *
29  * Author: Karl Sirotkin, Hsiu-Chuan Chen
30  *
31  * File Description:
32  * -----------------
33  *      Parsing genbank to memory blocks. Build Genbank format index block.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 
40 #include "index.h"
41 #include "genbank.h"
42 
43 #include "ftaerr.hpp"
44 #include "indx_blk.h"
45 #include "indx_def.h"
46 #include "utilfun.h"
47 #include "entry.h"
48 
49 #ifdef THIS_FILE
50 #    undef THIS_FILE
51 #endif
52 #define THIS_FILE "gb_index.cpp"
53 
54 BEGIN_NCBI_SCOPE
55 
56 KwordBlk genbankKeywordLength[] = {
57     {"LOCUS", 5},     {"DEFINITION", 10}, {"ACCESSION", 9}, {"NID", 3},
58     {"GSDB ID", 7},   {"KEYWORDS", 8},    {"SEGMENT", 7},   {"SOURCE", 6},
59     {"REFERENCE", 9}, {"COMMENT", 7},     {"FEATURES", 8},  {"BASE COUNT", 10},
60     {"ORIGIN", 6},    {"//", 2},          {"GSDBID", 6},    {"CONTIG", 6},
61     {"VERSION", 7},   {"USER", 4},        {"WGS", 3},       {"PRIMARY", 7},
62     {"MGA", 3},       {"PROJECT", 7},     {"DBLINK", 6},    {NULL, 0}
63 };
64 
65 // LCOV_EXCL_START
66 // Excluded per Mark's request on 12/14/2016
67 /**********************************************************
68  *
69  *   static bool DelSegnum(str, segnum, len2):
70  *
71  *      Strip off segnum which has number of "len1" digits,
72  *   then check if any tailing zero existed.
73  *      Subroutine return:
74  *   TRUE if
75  *   - there is no tailing zero or
76  *   - the number of the tailing zero is equal or greater
77  *     than (len2-len1) (i.e. strip off len2-len1 of "0").
78  *   FALSE and no change in the string "str" if
79  *   - len2-len1 less than zero or
80  *   - there is not enough "len1" digits at end of
81  *     the string "str" or
82  *   - there is not enough len2-len1 zero at end of
83  *     the string "str".
84  *
85  *                                      February 25 1993
86  *
87  **********************************************************/
DelSegnum(IndexblkPtr entry,char * segnum,size_t len2)88 static bool DelSegnum(IndexblkPtr entry, char* segnum, size_t len2)
89 {
90     char* str;
91     char* p;
92     char* q;
93 
94     if(segnum == NULL)
95         return false;
96     size_t len1 = StringLen(segnum);
97     if(len2 < len1)
98         return false;
99 
100     /* check, is there enough digits to delete
101      */
102     size_t tlen = len1;
103     str = entry->blocusname;
104     size_t i = StringLen(str) - 1;
105     for(; tlen > 0 && str[i] >= '0' && str[i] <= '9'; i--)
106         tlen--;
107 
108     if(tlen != 0 || i < 0)
109         return false;
110 
111     if(len2 > len1 && str[i] == '0')
112     {
113         /* check, is there enough "0" appended
114          */
115         for(tlen = len2 - len1; tlen > 0 && str[i] == '0'; i--)
116             tlen--;
117 
118         if(tlen != 0)
119             return false;
120     }
121 
122     for(q = &str[i+1], p = q; *p == '0';)
123         p++;
124 
125     i = atoi(segnum);
126     if((size_t) atoi(p) != i)
127     {
128         ErrPostEx(SEV_REJECT, ERR_SEGMENT_BadLocusName,
129                   "Segment suffix in locus name \"%s\" does not match number in SEGMENT line = \"%d\". Entry dropped.",
130                   str, i);
131         entry->drop = 1;
132     }
133 
134     *q = '\0';                          /* strip off "len" characters */
135     return true;
136 }
137 
138 /**********************************************************/
GetSegment(char * str,IndexblkPtr entry)139 static void GetSegment(char* str, IndexblkPtr entry)
140 {
141     TokenStatBlkPtr stoken;
142     TokenBlkPtr     ptr2;
143     TokenBlkPtr     ptr4;
144 
145     stoken = TokenString(str, ' ');
146 
147     if(stoken->num > 3)
148     {
149         ptr2 = stoken->list->next;
150         ptr4 = ptr2->next->next;
151         entry->segnum = (Uint2) atoi(ptr2->str);
152 
153         if(!DelSegnum(entry, ptr2->str, StringLen(ptr4->str)))
154         {
155             ErrPostEx(SEV_ERROR, ERR_SEGMENT_BadLocusName,
156                       "Bad locus name %s in %d",
157                       entry->blocusname, entry->linenum);
158         }
159 
160         entry->segtotal = (Uint2) atoi(ptr4->str);
161     }
162     else
163     {
164         ErrPostEx(SEV_ERROR, ERR_SEGMENT_IncompSeg,
165                   "Incomplete Segment information at linenum %d",
166                   entry->linenum);
167     }
168 
169     FreeTokenstatblk(stoken);
170 }
171 // LCOV_EXCL_STOP
172 
173 /**********************************************************/
gb_err_field(char * str)174 static Uint1 gb_err_field(char* str)
175 {
176     ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
177               "No %s data in GenBank format file, entry dropped", str);
178     return(1);
179 }
180 
181 /**********************************************************/
ParseGenBankVersion(IndexblkPtr entry,char * line,char * nid,Parser::ESource source,Parser::EMode mode,bool ign_toks)182 static void ParseGenBankVersion(IndexblkPtr entry, char* line, char* nid,
183                                 Parser::ESource source,
184                                 Parser::EMode mode,
185                                 bool ign_toks)
186 {
187     bool gi;
188     char* p;
189     char* q;
190     char* r;
191     Char    ch;
192     Char    ch1;
193 
194     if(line == NULL)
195         return;
196 
197     for(p = line; *p != '\0' && *p != ' ' && *p != '\t';)
198         p++;
199     gi = (*p == '\0') ? false : true;
200 
201     ch1 = *p;
202     *p = '\0';
203     q = StringRChr(line, '.');
204     if(q == NULL)
205     {
206         if (mode != Parser::EMode::Relaxed) {
207             *p = ch1;
208             ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum,
209                     "Missing VERSION number in VERSION line: \"%s\".", line);
210             entry->drop = 1;
211         }
212         return;
213     }
214 
215     for(r = q + 1; *r >= '0' && *r <= '9';)
216         r++;
217     if(*r != '\0')
218     {
219         if (mode != Parser::EMode::Relaxed) {
220             *p = ch1;
221             ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum,
222                     "Incorrect VERSION number in VERSION line: \"%s\".", line);
223             entry->drop = 1;
224         }
225         return;
226     }
227     ch = *q;
228     *q = '\0';
229     if(entry->acnum == NULL || StringCmp(entry->acnum, line) != 0)
230     {
231         *q = ch;
232         *p = ch1;
233         if (mode != Parser::EMode::Relaxed) {
234             ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch,
235                   "Accessions in VERSION and ACCESSION lines don't match: \"%s\" vs \"%s\".",
236                   line, (entry->acnum == NULL) ? "NULL" : entry->acnum);
237             entry->drop = 1;
238         }
239         return;
240     }
241     entry->vernum = atoi(q + 1);
242     *q = ch;
243 
244     if(entry->vernum < 1)
245     {
246         *p = ch1;
247         ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion,
248                   "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer.",
249                   entry->vernum, entry->acnum, entry->vernum);
250         entry->drop = 1;
251         return;
252     }
253 
254     if(ch1 != '\0')
255         for(*p++ = ch1; *p == ' ' || *p == '\t';)
256             p++;
257 
258     if(source == Parser::ESource::DDBJ)
259     {
260         if(*p != '\0' && !ign_toks)
261         {
262             ErrPostEx(SEV_ERROR, ERR_VERSION_BadVersionLine,
263                       "DDBJ's VERSION line has too many tokens: \"%s\".", line);
264         }
265         return;
266     }
267 
268     if(!gi)
269         return;
270 
271     if(StringNCmp(p, "GI:", 3) != 0)
272     {
273         ErrPostEx(SEV_FATAL, ERR_VERSION_IncorrectGIInVersion,
274                   "Incorrect GI entry in VERSION line: \"%s\".", line);
275         entry->drop = 1;
276         return;
277     }
278     p += 3;
279     for(q = p; *q >= '0' && *q <= '9';)
280         q++;
281     if(*q != '\0')
282     {
283         ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitGI,
284                   "Incorrect GI number in VERSION line: \"%s\".", line);
285         entry->drop = 1;
286     }
287 }
288 
289 /**********************************************************/
fta_check_mga_line(char * line,IndexblkPtr ibp)290 static bool fta_check_mga_line(char* line, IndexblkPtr ibp)
291 {
292     char* p;
293     char* q;
294     char* str;
295     Int4    from;
296     Int4    to;
297 
298     if(line == NULL || ibp == NULL)
299         return false;
300 
301     for(p = line; *p == ' ' || *p == '\t';)
302         p++;
303     str = StringSave(p);
304     p = StringChr(str, '\n');
305     if(p != NULL)
306         *p = '\0';
307     p = StringChr(str, '-');
308     if(p == NULL)
309     {
310         MemFree(str);
311         return false;
312     }
313     *p++ = '\0';
314 
315     if(StringLen(str) != 12 || StringLen(p) != 12 ||
316        StringNCmp(str, ibp->acnum, 5) != 0 ||
317        StringNCmp(p, ibp->acnum, 5) != 0)
318     {
319         MemFree(str);
320         return false;
321     }
322 
323     for(q = str + 5; *q >= '0' && *q <= '9';)
324         q++;
325     if(*q != '\0')
326     {
327         MemFree(str);
328         return false;
329     }
330     for(q = p + 5; *q >= '0' && *q <= '9';)
331         q++;
332     if(*q != '\0')
333     {
334         MemFree(str);
335         return false;
336     }
337 
338     for(q = str + 5; *q == '0';)
339         q++;
340     from = atoi(q);
341     for(q = p + 5; *q == '0';)
342         q++;
343     to = atoi(q);
344 
345     if(from > to)
346     {
347         MemFree(str);
348         return false;
349     }
350 
351     ibp->bases = to - from + 1;
352     MemFree(str);
353     return true;
354 }
355 
356 
357 
358 /**********************************************************/
GenBankIndex(ParserPtr pp)359 bool GenBankIndex(ParserPtr pp)
360 {
361     FinfoBlkPtr   finfo;
362 
363     bool          acwflag;
364     bool          end_of_file;
365     bool          after_LOCUS;
366     bool          after_DEFNTN;
367     bool          after_SOURCE;
368     bool          after_REFER;
369     bool          after_FEAT;
370     bool          after_ORIGIN;
371     bool          after_COMMENT;
372     bool          after_VERSION;
373     bool          after_MGA;
374 
375     IndexblkPtr   entry;
376     Int2          currentKeyword;
377     Int4          indx = 0;
378     DataBlkPtr    data = NULL;
379     IndBlkNextPtr ibnp;
380     IndBlkNextPtr tibnp;
381     char*       p;
382     char*       q;
383     char*       line_ver;
384     char*       line_nid;
385     char*       line_locus;
386     size_t        i;
387     ValNodePtr    kwds;
388     ValNodePtr    tkwds;
389     ValNodePtr    dbl;
390     ValNodePtr    tdbl;
391 
392     finfo = (FinfoBlkPtr) MemNew(sizeof(FinfoBlk));
393 
394     end_of_file = SkipTitleBuf(pp->ffbuf, finfo, "LOCUS");
395 
396     if(end_of_file)
397     {
398         MsgSkipTitleFail((char*) "GenBank", finfo);
399         return false;
400     }
401 
402     bool tpa_check = (pp->source == Parser::ESource::EMBL);
403 
404     ibnp = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
405     ibnp->next = NULL;
406     tibnp = ibnp;
407 
408     pp->num_drop = 0;
409     kwds = NULL;
410     dbl = NULL;
411     while (!end_of_file)
412     {
413         entry = InitialEntry(pp, finfo);
414         if(entry != NULL)
415         {
416             pp->curindx = indx;
417             tibnp->next = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
418             tibnp = tibnp->next;
419             tibnp->ibp = entry;
420             tibnp->next = NULL;
421 
422             indx++;
423 
424             entry->is_contig = false;
425             entry->origin = false;
426             entry->is_mga = false;
427             acwflag = false;
428             after_LOCUS = false;
429             after_DEFNTN = false;
430             after_SOURCE = false;
431             after_REFER = false;
432             after_FEAT = false;
433             after_ORIGIN = false;
434             after_COMMENT = false;
435             after_VERSION = false;
436             after_MGA = false;
437 
438             currentKeyword = ParFlat_LOCUS;
439             line_ver = NULL;
440             line_nid = NULL;
441             line_locus = NULL;
442             if(kwds != NULL)
443                 kwds = ValNodeFreeData(kwds);
444             tkwds = NULL;
445             size_t kwds_len = 0;
446             if(dbl != NULL)
447                 dbl = ValNodeFreeData(dbl);
448             tdbl = NULL;
449             size_t dbl_len = 0;
450             while(currentKeyword != ParFlat_END && !end_of_file)
451             {
452                 switch(currentKeyword)
453                 {
454                     case ParFlat_LOCUS:
455                         if(after_LOCUS)
456                         {
457                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
458                                       "More than two lines LOCUS in one entry");
459                             entry->drop = 1;
460                         }
461                         else
462                         {
463                             after_LOCUS = true;
464                             line_locus = StringSave(finfo->str);
465                         }
466                         break;
467                     case ParFlat_COMMENT:
468                         if(after_COMMENT)
469                         {
470                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
471                                       "Multiple COMMENT lines in one entry");
472                             entry->drop = 1;
473                         }
474                         else
475                             after_COMMENT = true;
476 
477                         break;
478                     case ParFlat_VERSION:
479                         p = StringStr(finfo->str + ParFlat_COL_DATA, "GI:");
480                         if(p != NULL && atol(p + 3) > 0)
481                             entry->wgs_and_gi |= 01;
482                         if(pp->accver == false)
483                             break;
484                         if(after_VERSION)
485                         {
486                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
487                                       "Multiple VERSION lines in one entry");
488                             entry->drop = 1;
489                             break;
490                         }
491                         after_VERSION = true;
492                         p = finfo->str + ParFlat_COL_DATA;
493                         while(*p == ' ' || *p == '\t')
494                             p++;
495                         for(q = p; *q != '\0' && *q != '\n';)
496                             q++;
497                         while(q > p)
498                         {
499                             q--;
500                             if(*q != ' ' && *q != '\t')
501                             {
502                                 q++;
503                                 break;
504                             }
505                         }
506                         i = q - p;
507                         line_ver = (char*) MemNew(i + 1);
508                         StringNCpy(line_ver, p, i);
509                         line_ver[i] = '\0';
510                         break;
511                     case ParFlat_NCBI_GI:
512                         if(pp->source == Parser::ESource::DDBJ || pp->accver == false ||
513                            line_nid != NULL)
514                             break;
515                         p = finfo->str + ParFlat_COL_DATA;
516                         while(*p == ' ' || *p == '\t')
517                             p++;
518                         for(q = p; *q != '\0' && *q != ' ' && *q != '\t' &&
519                                    *q != '\n';)
520                             q++;
521                         i = q - p;
522                         line_nid = (char*) MemNew(i + 1);
523                         StringNCpy(line_nid, p, i);
524                         line_nid[i] = '\0';
525                         break;
526                     case ParFlat_DEFINITION:
527                         if(after_DEFNTN)
528                         {
529                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
530                                       "More than two lines 'DEFINITION'");
531                             entry->drop = 1;
532                         }
533                         else if(after_LOCUS == false)
534                         {
535                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
536                                       "DEFINITION field out of order");
537                             entry->drop = 1;
538                         }
539                         else
540                             after_DEFNTN = true;
541 
542                         break;
543                     case ParFlat_SOURCE:
544                         if(after_SOURCE)
545                         {
546                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
547                                       "More than two lines 'SOURCE'");
548                             entry->drop = 1;
549                         }
550                         else if(after_LOCUS == false || after_DEFNTN == false)
551                         {
552                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
553                                       "SOURCE field out of order");
554                             entry->drop = 1;
555                         }
556                         else
557                             after_SOURCE = true;
558 
559                         break;
560                     case ParFlat_REFERENCE:
561                         after_REFER = true;
562                         break;
563                     case ParFlat_CONTIG:
564                         if(entry->is_contig)
565                         {
566                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
567                                       "More than one line CONTIG in one entry");
568                             entry->drop = 1;
569                         }
570                         else
571                             entry->is_contig = true;
572                         break;
573                     case ParFlat_MGA:
574                         if(entry->is_mga == false)
575                         {
576                             ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
577                                       "Line type \"MGA\" is allowed for CAGE records only. Entry dropped.");
578                             entry->drop = 1;
579                         }
580                         if(fta_check_mga_line(finfo->str + ParFlat_COL_DATA, entry) == false)
581                         {
582                             ErrPostEx(SEV_REJECT, ERR_FORMAT_IncorrectMGALine,
583                                       "Incorrect range of accessions supplied in MGA line of CAGE record. Entry dropped.");
584                             entry->drop = 1;
585                         }
586                         after_MGA = true;
587                         break;
588                     case ParFlat_FEATURES:
589                         if(after_FEAT)
590                         {
591                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
592                                       "More than two lines 'FEATURES'");
593                             entry->drop = 1;
594                         }
595                         else if(pp->mode != Parser::EMode::Relaxed &&
596                                 (after_LOCUS == false ||
597                                 after_DEFNTN == false ||
598                                 after_SOURCE == false))
599                         {
600                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
601                                       "FEATURES field out of order");
602                             entry->drop = 1;
603                         }
604                         else
605                             after_FEAT = true;
606 
607                         break;
608                     case ParFlat_ORIGIN:
609                         if(after_ORIGIN)
610                         {
611                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
612                                       "More than two lines 'ORIGIN'");
613                             entry->drop = 1;
614                         }
615                         else if(
616                                 pp->mode != Parser::EMode::Relaxed &&
617                                 (after_LOCUS == false ||
618                                 after_DEFNTN == false ||
619                                 after_SOURCE == false ||
620                                 after_FEAT == false))
621                         {
622                             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
623                                       "ORIGIN field out of order");
624                             entry->drop = 1;
625                         }
626                         else
627                         {
628                             after_ORIGIN = true;
629                             entry->origin = true;
630                         }
631                         break;
632                     case ParFlat_ACCESSION:
633                         if(acwflag == false)    /* first accession line */
634                         {
635                             acwflag = true;
636                             if (!GetAccession(pp, finfo->str, entry, 2)) {
637                                 if (pp->mode != Parser::EMode::Relaxed) {
638                                     pp->num_drop++;
639                                 }
640                             }
641                         }
642                         break;
643                     case ParFlat_SEGMENT:
644 // LCOV_EXCL_START
645 // Excluded per Mark's request on 12/14/2016
646                         GetSegment(finfo->str, entry);
647 // LCOV_EXCL_STOP
648                         break;
649                     case ParFlat_USER:
650                         if(pp->source != Parser::ESource::Flybase)
651                         {
652                             ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
653                                       "Line type \"USER\" is allowed for source \"FLYBASE\" only. Entry dropped.");
654                             entry->drop = 1;
655                         }
656                         break;
657                     case ParFlat_PRIMARY:
658                         if(entry->is_tpa == false &&
659                            entry->tsa_allowed == false &&
660                            pp->source != Parser::ESource::Refseq)
661                         {
662                             ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
663                                       "Line type \"PRIMARY\" is allowed for TPA or TSA records only. Continue anyway.");
664                         }
665                         break;
666                     case ParFlat_KEYWORDS:
667                         if(pp->source != Parser::ESource::DDBJ &&
668                            pp->source != Parser::ESource::EMBL)
669                             break;
670                         if(kwds != NULL)
671                             kwds = ValNodeFreeData(kwds);
672                         kwds = ConstructValNode(NULL, 0,
673                                                 StringSave(finfo->str + 8));
674                         tkwds = kwds;
675                         kwds_len = StringLen(finfo->str) - 8;
676                         break;
677                     case ParFlat_DBLINK:
678                         if(dbl != NULL)
679                             dbl = ValNodeFreeData(dbl);
680                         dbl = ConstructValNode(NULL, 0,
681                                                 StringSave(finfo->str + 8));
682                         tdbl = dbl;
683                         dbl_len = StringLen(finfo->str) - 8;
684                         break;
685                     default:
686                         break;
687                 } /* switch */
688 
689                 end_of_file = XReadFileBuf(pp->ffbuf, finfo);
690 
691                 while (!end_of_file && (finfo->str[0] == ' ' || finfo->str[0] == '\t'))
692                 {
693                     if(currentKeyword == ParFlat_KEYWORDS && tkwds != NULL)
694                     {
695                         tkwds->next = ValNodeNew(NULL);
696                         tkwds = tkwds->next;
697                         tkwds->data.ptrvalue = StringSave(finfo->str);
698                         kwds_len += StringLen(finfo->str);
699                     }
700 
701                     if(currentKeyword == ParFlat_DBLINK && tdbl != NULL)
702                     {
703                         tdbl->next = ValNodeNew(NULL);
704                         tdbl = tdbl->next;
705                         tdbl->data.ptrvalue = StringSave(finfo->str);
706                         dbl_len += StringLen(finfo->str);
707                     }
708 
709                     if(currentKeyword == ParFlat_ACCESSION && entry->drop == 0 &&
710                        GetAccession(pp, finfo->str, entry, 0) == false)
711                         pp->num_drop++;
712 
713                     end_of_file = XReadFileBuf(pp->ffbuf, finfo);
714                 }
715 
716 
717 
718                 if(kwds != NULL)
719                 {
720                     check_est_sts_gss_tpa_kwds(kwds, kwds_len, entry,
721                                                tpa_check, entry->specialist_db,
722                                                entry->inferential,
723                                                entry->experimental,
724                                                entry->assembly);
725                     kwds = ValNodeFreeData(kwds);
726                     kwds_len = 0;
727                 }
728 
729                 if (pp->mode == Parser::EMode::Relaxed &&
730                     NStr::IsBlank(finfo->str)) {
731                     currentKeyword = ParFlat_UNKW;
732                     continue;
733                 }
734 
735                 currentKeyword = SrchKeyword(finfo->str, genbankKeywordLength);
736 
737                 if(finfo->str[0] != ' ' && finfo->str[0] != '\t' &&
738                    CheckLineType(finfo->str, finfo->line, genbankKeywordLength, after_ORIGIN) == false)
739                      entry->drop = 1;
740 
741             } /* while, end of one entry */
742 
743             entry->is_tpa_wgs_con = (entry->is_contig && entry->is_wgs && entry->is_tpa);
744 
745             if(entry->drop != 1)
746             {
747 
748                 if (pp->mode != Parser::EMode::Relaxed) {
749                     if(line_locus != NULL &&
750                     CkLocusLinePos(line_locus, pp->source, &entry->lc, entry->is_mga) == false)
751                         entry->drop = 1;
752 
753                     if(entry->is_mga && after_MGA == false)
754                         entry->drop = gb_err_field((char*) "MGA");
755 
756                     if(after_LOCUS == false)
757                         entry->drop = gb_err_field((char*) "LOCUS");
758 
759                     if(after_VERSION == false && pp->accver)
760                         entry->drop = gb_err_field((char*) "VERSION");
761 
762                     if(after_DEFNTN == false)
763                         entry->drop = gb_err_field((char*) "DEFINITION");
764 
765                     if(after_SOURCE == false)
766                         entry->drop = gb_err_field((char*) "SOURCE");
767 
768                     if(after_REFER == false && pp->source != Parser::ESource::Flybase &&
769                        entry->is_wgs == false &&
770                        (pp->source != Parser::ESource::Refseq ||
771                         StringNCmp(entry->acnum, "NW_", 3) != 0)) {
772                             entry->drop = gb_err_field((char*) "REFERENCE");
773                     }
774 
775                     if(after_FEAT == false) {
776                         entry->drop = gb_err_field((char*) "FEATURES");
777                     }
778                 } // !Parser::EMode::Relaxed
779 
780                 if(entry->is_contig && entry->segnum != 0)
781                 {
782                     ErrPostEx(SEV_ERROR, ERR_FORMAT_ContigInSegset,
783                               "CONTIG data are not allowed for members of segmented sets, entry dropped.");
784                     entry->drop = 1;
785                 }
786             }
787             if(pp->accver)
788             {
789                 if(pp->mode == Parser::EMode::HTGSCON)
790                     entry->vernum = 1;
791                 else
792                     ParseGenBankVersion(
793                             entry,
794                             line_ver,
795                             line_nid,
796                             pp->source,
797                             pp->mode,
798                             pp->ign_toks);
799             }
800             if(line_locus != NULL)
801             {
802                 MemFree(line_locus);
803                 line_locus = NULL;
804             }
805             if(line_ver != NULL)
806             {
807                 MemFree(line_ver);
808                 line_ver = NULL;
809             }
810             if(line_nid != NULL)
811             {
812                 MemFree(line_nid);
813                 line_nid = NULL;
814             }
815             entry->len = (size_t) (pp->ffbuf.current - pp->ffbuf.start) -
816                              entry->offset;
817 
818             if(acwflag == false &&
819                pp->mode != Parser::EMode::Relaxed)
820             {
821                 ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
822                           "No accession # for this entry, about line %ld",
823                           (long int) entry->linenum);
824             }
825 
826             if(dbl != NULL)
827             {
828                 dbl = ValNodeFreeData(dbl);
829                 dbl_len = 0;
830             }
831         } /* if, entry */
832         else
833         {
834             end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "//");
835         }
836 
837         end_of_file = FindNextEntryBuf(end_of_file, pp->ffbuf, finfo, "LOCUS");
838 
839     } /* while, end_of_file */
840 
841     pp->indx = indx;
842 
843     FtaDeletePrefix(PREFIX_LOCUS | PREFIX_ACCESSION);
844 
845     if(pp->qsfd != NULL && QSIndex(pp, ibnp->next) == false)
846         return false;
847 
848     pp->entrylist = (IndexblkPtr*) MemNew(indx * sizeof(IndexblkPtr));
849     tibnp = ibnp->next;
850     MemFree(ibnp);
851     for(int j = 0; j < indx && tibnp != NULL; j++, tibnp = ibnp)
852     {
853         pp->entrylist[j] = tibnp->ibp;
854         ibnp = tibnp->next;
855         MemFree(tibnp);
856     }
857 
858     MemFree(finfo);
859 
860     return(end_of_file);
861 }
862 
863 END_NCBI_SCOPE
864