1 /* xm_index.c
2  *
3  * ===========================================================================
4  *
5  *                            PUBLIC DOMAIN NOTICE
6  *               National Center for Biotechnology Information
7  *
8  *  This software/database is a "United States Government Work" under the
9  *  terms of the United States Copyright Act.  It was written as part of
10  *  the author's official duties as a United States Government employee and
11  *  thus cannot be copyrighted.  This software/database is freely available
12  *  to the public for use. The National Library of Medicine and the U.S.
13  *  Government have not placed any restriction on its use or reproduction.
14  *
15  *  Although all reasonable efforts have been taken to ensure the accuracy
16  *  and reliability of the software and data, the NLM and the U.S.
17  *  Government do not and cannot warrant the performance or results that
18  *  may be obtained by using this software or data. The NLM and the U.S.
19  *  Government disclaim all warranties, express or implied, including
20  *  warranties of performance, merchantability or fitness for any particular
21  *  purpose.
22  *
23  *  Please cite the author in any work or product based on this material.
24  *
25  * ===========================================================================
26  *
27  * File Name:  xm_index.c
28  *
29  * Author: Sergey Bazhin
30  *
31  * File Description:
32  * -----------------
33  *      Parsing flat records to memory blocks in XML format.
34  *
35  */
36 #include <ncbi_pch.hpp>
37 
38 #include "ftacpp.hpp"
39 #include "index.h"
40 
41 #include "ftaerr.hpp"
42 #include "indx_blk.h"
43 #include "indx_def.h"
44 #include "utilfun.h"
45 #include "fta_xml.h"
46 
/* Make THIS_FILE name this source file, discarding any definition that an
 * earlier header may have left behind. */
#ifdef THIS_FILE
#    undef THIS_FILE
#endif
#define THIS_FILE "xm_index.cpp"
51 
/* Line-type prefix that XMLAccessionsCheck() puts in front of the accession
 * list it assembles before passing the text to GetAccession(). */
#define XML_FAKE_ACC_TAG "AC   "
53 
54 BEGIN_NCBI_SCOPE
55 
/* One row of an XML keyword table.  Every table in this file ends with a
 * row whose str is NULL. */
typedef struct _XmlKwordBlk {
    const char *str;                    /* opening tag text including the
                                           angle brackets; NULL marks the
                                           end of a table */
    Int4       order;                   /* position of the tag among its
                                           siblings; XMLPerformIndex() copies
                                           it into XmlIndex nodes and
                                           XMLTagCheck() compares neighbours */
    Int4       tag;                     /* numeric tag identifier */
} XmlKwordBlk, *XmlKwordBlkPtr;
61 
/* Tags that appear directly inside a record.  XMLPerformIndex() matches
 * every tag it reads against this table (opening and closing forms alike)
 * and stores the row's tag and order in the index node it creates. */
XmlKwordBlk xmkwl[] = {
    {"<INSDSeq_locus>",                 1, INSDSEQ_LOCUS},
    {"<INSDSeq_length>",                2, INSDSEQ_LENGTH},
    {"<INSDSeq_strandedness>",          3, INSDSEQ_STRANDEDNESS},
    {"<INSDSeq_moltype>",               4, INSDSEQ_MOLTYPE},
    {"<INSDSeq_topology>",              5, INSDSEQ_TOPOLOGY},
    {"<INSDSeq_division>",              6, INSDSEQ_DIVISION},
    {"<INSDSeq_update-date>",           7, INSDSEQ_UPDATE_DATE},
    {"<INSDSeq_create-date>",           8, INSDSEQ_CREATE_DATE},
    {"<INSDSeq_update-release>",        9, INSDSEQ_UPDATE_RELEASE},
    {"<INSDSeq_create-release>",       10, INSDSEQ_CREATE_RELEASE},
    {"<INSDSeq_definition>",           11, INSDSEQ_DEFINITION},
    {"<INSDSeq_primary-accession>",    12, INSDSEQ_PRIMARY_ACCESSION},
    {"<INSDSeq_entry-version>",        13, INSDSEQ_ENTRY_VERSION},
    {"<INSDSeq_accession-version>",    14, INSDSEQ_ACCESSION_VERSION},
    {"<INSDSeq_other-seqids>",         15, INSDSEQ_OTHER_SEQIDS},
    {"<INSDSeq_secondary-accessions>", 16, INSDSEQ_SECONDARY_ACCESSIONS},
    {"<INSDSeq_keywords>",             17, INSDSEQ_KEYWORDS},
    {"<INSDSeq_segment>",              18, INSDSEQ_SEGMENT},
    {"<INSDSeq_source>",               19, INSDSEQ_SOURCE},
    {"<INSDSeq_organism>",             20, INSDSEQ_ORGANISM},
    {"<INSDSeq_taxonomy>",             21, INSDSEQ_TAXONOMY},
    {"<INSDSeq_references>",           22, INSDSEQ_REFERENCES},
    {"<INSDSeq_comment>",              23, INSDSEQ_COMMENT},
    {"<INSDSeq_primary>",              24, INSDSEQ_PRIMARY},
    {"<INSDSeq_source-db>",            25, INSDSEQ_SOURCE_DB},
    {"<INSDSeq_database-reference>",   26, INSDSEQ_DATABASE_REFERENCE},
    {"<INSDSeq_feature-table>",        27, INSDSEQ_FEATURE_TABLE},
    {"<INSDSeq_sequence>",             28, INSDSEQ_SEQUENCE},
    {"<INSDSeq_contig>",               29, INSDSEQ_CONTIG},
    {NULL,                             -1, -1}
};
94 
/* Keyword table for the <INSDFeature_*> tags, in their expected order. */
XmlKwordBlk xmfeatkwl[] = {
    {"<INSDFeature_key>",               1, INSDFEATURE_KEY},
    {"<INSDFeature_location>",          2, INSDFEATURE_LOCATION},
    {"<INSDFeature_intervals>",         3, INSDFEATURE_INTERVALS},
    {"<INSDFeature_quals>",             4, INSDFEATURE_QUALS},
    {NULL,                             -1, -1}
};
102 
/* Keyword table for the <INSDInterval_*> tags, in their expected order. */
XmlKwordBlk xmintkwl[] = {
    {"<INSDInterval_from>",             1, INSDINTERVAL_FROM},
    {"<INSDInterval_to>",               2, INSDINTERVAL_TO},
    {"<INSDInterval_point>",            3, INSDINTERVAL_POINT},
    {"<INSDInterval_accession>",        4, INSDINTERVAL_ACCESSION},
    {NULL,                             -1, -1}
};
110 
/* Keyword table for the <INSDReference_*> tags, in their expected order. */
XmlKwordBlk xmrefkwl[] = {
    {"<INSDReference_reference>",       1, INSDREFERENCE_REFERENCE},
    {"<INSDReference_position>",        2, INSDREFERENCE_POSITION},
    {"<INSDReference_authors>",         3, INSDREFERENCE_AUTHORS},
    {"<INSDReference_consortium>",      4, INSDREFERENCE_CONSORTIUM},
    {"<INSDReference_title>",           5, INSDREFERENCE_TITLE},
    {"<INSDReference_journal>",         6, INSDREFERENCE_JOURNAL},
    {"<INSDReference_xref>",            7, INSDREFERENCE_XREF},
    {"<INSDReference_medline>",         8, INSDREFERENCE_MEDLINE},
    {"<INSDReference_pubmed>",          9, INSDREFERENCE_PUBMED},
    {"<INSDReference_remark>",         10, INSDREFERENCE_REMARK},
    {NULL,                             -1, -1}
};
124 
/* Keyword table for the <INSDQualifier_*> tags, in their expected order. */
XmlKwordBlk xmqualkwl[] = {
    {"<INSDQualifier_name>",            1, INSDQUALIFIER_NAME},
    {"<INSDQualifier_value>",           2, INSDQUALIFIER_VALUE},
    {NULL,                             -1, -1}
};
130 
/* Keyword table for the <INSDXref_*> tags, in their expected order. */
XmlKwordBlk xmxrefkwl[] = {
    {"<INSDXref_dbname>",               1, INSDXREF_DBNAME},
    {"<INSDXref_id>",                   2, INSDXREF_ID},
    {NULL,                             -1, -1}
};
136 
/* Element tags looked up by XMLIndexSameSubTags() (through
 * XMLStringByTag()) to obtain the tag text for a sub-element id.  Every row
 * carries order 1. */
XmlKwordBlk xmsubkwl[] = {
    {"<INSDSecondary-accn>",            1, INSDSECONDARY_ACCN},
    {"<INSDKeyword>",                   1, INSDKEYWORD},
    {"<INSDFeature>",                   1, INSDFEATURE},
    {"<INSDInterval>",                  1, INSDINTERVAL},
    {"<INSDQualifier>",                 1, INSDQUALIFIER},
    {"<INSDReference>",                 1, INSDREFERENCE},
    {"<INSDAuthor>",                    1, INSDAUTHOR},
    {"<INSDXref>",                      1, INSDXREF},
    {NULL,                             -1, -1}
};
148 
149 /**********************************************************/
XMLIndexNew(void)150 static XmlIndexPtr XMLIndexNew(void)
151 {
152     XmlIndexPtr xip;
153 
154     xip = (XmlIndexPtr) MemNew(sizeof(XmlIndex));
155     xip->tag = -1;
156     xip->order = -1;
157     xip->start = 0;
158     xip->end = 0;
159     xip->start_line = -1;
160     xip->end_line = -1;
161     xip->subtags = NULL;
162     xip->next = NULL;
163     return(xip);
164 }
165 
166 /**********************************************************/
XMLRestoreSpecialCharacters(char * buf)167 static void XMLRestoreSpecialCharacters(char* buf)
168 {
169     char* p;
170     char* q;
171 
172     for(p = buf, q = buf; *p != '\0';)
173     {
174         if(StringNCmp(p, "&lt;", 4) == 0)
175         {
176             *q++ = '<';
177             p += 4;
178         }
179         else if(StringNCmp(p, "&gt;", 4) == 0)
180         {
181             *q++ = '>';
182             p += 4;
183         }
184         else if(StringNCmp(p, "&amp;", 5) == 0)
185         {
186             *q++ = '&';
187             p += 5;
188         }
189         else if(StringNCmp(p, "&apos;", 6) == 0)
190         {
191             *q++ = '\'';
192             p += 6;
193         }
194         else if(StringNCmp(p, "&quot;", 6) == 0)
195         {
196             *q++ = '\"';
197             p += 6;
198         }
199         else
200             *q++ = *p++;
201     }
202     *q = '\0';
203 }
204 
205 /**********************************************************/
XMLGetTagValue(char * entry,XmlIndexPtr xip)206 char* XMLGetTagValue(char* entry, XmlIndexPtr xip)
207 {
208     if(entry == NULL || xip == NULL || xip->start == 0 || xip->end == 0 ||
209        xip->start >= xip->end)
210         return(NULL);
211 
212     size_t i = xip->end - xip->start;
213     char* buf = (char*)MemNew(i + 1);
214     StringNCpy(buf, entry + xip->start, i);
215     buf[i] = '\0';
216 
217     XMLRestoreSpecialCharacters(buf);
218     return(buf);
219 }
220 
221 /**********************************************************/
XMLFindTagValue(char * entry,XmlIndexPtr xip,Int4 tag)222 char* XMLFindTagValue(char* entry, XmlIndexPtr xip, Int4 tag)
223 {
224     for(; xip != NULL; xip = xip->next)
225         if(xip->tag == tag)
226             break;
227     if(xip == NULL)
228         return(NULL);
229     return(XMLGetTagValue(entry, xip));
230 }
231 
232 /**********************************************************/
XMLDelSegnum(IndexblkPtr ibp,char * segnum,size_t len2)233 static bool XMLDelSegnum(IndexblkPtr ibp, char* segnum, size_t len2)
234 {
235     if(segnum == NULL)
236         return false;
237     size_t len1 = StringLen(segnum);
238     if(len2 < len1)
239         return false;
240 
241     /* check, is there enough digits to delete
242      */
243     size_t tlen = len1;
244     char* str = ibp->blocusname;
245     size_t i = StringLen(str) - 1;
246     for(; tlen > 0 && str[i] >= '0' && str[i] <= '9'; i--)
247         tlen--;
248 
249     if(tlen != 0 || i < 0)
250         return false;
251 
252     if(len2 > len1 && str[i] == '0')
253     {
254         /* check, is there enough "0" appended
255          */
256         for(tlen = len2 - len1; tlen > 0 && str[i] == '0'; i--)
257             tlen--;
258 
259         if(tlen != 0)
260             return false;
261     }
262 
263     char* p;
264     char* q;
265     for (q = &str[i + 1], p = q; *p == '0';)
266         p++;
267 
268     i = atoi(segnum);
269     if(atoi(p) != (int) i)
270     {
271         ErrPostEx(SEV_REJECT, ERR_SEGMENT_BadLocusName,
272                   "Segment suffix in locus name \"%s\" does not match number in <INSDSEQ_segment> line = \"%d\". Entry dropped.",
273                   str, i);
274         ibp->drop = 1;
275     }
276 
277     *q = '\0';                          /* strip off "len" characters */
278     return true;
279 }
280 
281 /**********************************************************/
XMLGetSegment(char * entry,IndexblkPtr ibp)282 static void XMLGetSegment(char* entry, IndexblkPtr ibp)
283 {
284     TokenStatBlkPtr stoken;
285     XmlIndexPtr     xip;
286     char*         buf;
287     char*         segnum;
288     char*         segtotal;
289 
290     if(entry == NULL || ibp == NULL || ibp->xip == NULL)
291         return;
292 
293     for(xip = ibp->xip; xip != NULL; xip = xip->next)
294         if(xip->tag == INSDSEQ_SEGMENT)
295             break;
296     if(xip == NULL)
297         return;
298 
299     buf = XMLGetTagValue(entry, xip);
300     if(buf == NULL)
301         return;
302 
303     stoken = TokenString(buf, ' ');
304 
305     if(stoken->num > 2)
306     {
307         segnum = stoken->list->str;
308         segtotal = stoken->list->next->next->str;
309         ibp->segnum = (Uint2) atoi(segnum);
310 
311         if (!XMLDelSegnum(ibp, segnum, StringLen(segtotal)))
312         {
313             ErrPostEx(SEV_ERROR, ERR_SEGMENT_BadLocusName,
314                       "Bad locus name \"%s\".", ibp->blocusname);
315         }
316 
317         ibp->segtotal = (Uint2) atoi(segtotal);
318     }
319     else
320     {
321         ErrPostEx(SEV_ERROR, ERR_SEGMENT_IncompSeg,
322                   "Incomplete Segment information at line %d.",
323                   xip->start_line);
324     }
325 
326     FreeTokenstatblk(stoken);
327     MemFree(buf);
328 }
329 
330 
s_HasInput(const Parser & config)331 static bool s_HasInput(const Parser& config) {
332     return (config.ffbuf.start != nullptr);
333 }
334 
335 
s_GetCharAndAdvance(Parser & config)336 static int s_GetCharAndAdvance(Parser& config) {
337     if (*config.ffbuf.current == '\0') {
338         return -1;
339     }
340     return *(config.ffbuf.current++);
341 }
342 
s_SetPointer(Parser & config,int offset)343 void s_SetPointer(Parser& config, int offset) {
344     config.ffbuf.current = config.ffbuf.start + offset;
345 }
346 
347 /**********************************************************/
/*
 * XMLPerformIndex: scan the whole input buffer once and build the index.
 *
 * For every record (text between INSDSEQ_START and INSDSEQ_END) an Indexblk
 * is created holding the record's offset, first line and length, plus a
 * linked list of XmlIndex nodes, one per top-level tag found in xmkwl, with
 * the start/end offsets (relative to the record) and line numbers of the
 * tag's value.  On return pp->indx is the number of records and
 * pp->entrylist a NULL-terminated array of the Indexblk pointers.
 *
 * Returns immediately when pp is NULL or has no input buffer.
 */
static void XMLPerformIndex(ParserPtr pp)
{
    XmlKwordBlkPtr xkbp;
    IndBlkNextPtr  ibnp;
    IndBlkNextPtr  tibnp;
    XmlIndexPtr    xip;
    IndexblkPtr    ibp;
    char*        p;
    Char           s[60];
    Char           ch;
    size_t         count;
    Int4           line;
    Int4           c;
    Int4           i;


    if (!pp || !s_HasInput(*pp)) {
        return;
    }

    c = 0;
    s[0] = '\0';
    bool within = false;                /* true while inside a record */
    tibnp = NULL;
    ibnp = NULL;
    ibp = NULL;
    xip = NULL;
    pp->indx = 0;
    size_t start_len = StringLen(INSDSEQ_START);
    /* count = characters consumed so far, line = current line number */
    for(count = 0, line = 1;;)
    {
        /* Skip forward to the next '<'.  When the previous tag read was
         * aborted by a '<', c still holds it and nothing is consumed here.
         */
        if(c != '<')
        {
            c = s_GetCharAndAdvance(*pp);
            if(c < 0)
                break;
            count++;
            if((Char) c == '\n')
                line++;
        }
        if(c != '<')
            continue;

        /* Collect the tag text into s, stopping at '>' (tag complete), at
         * another '<' (restart) or after 49 characters.
         * NOTE(review): if the loop runs out at i == 50 without seeing '<'
         * or '>', s[50] is never written before s[51] gets the terminator
         * below, so the comparisons read one indeterminate character.
         */
        s[0] = '<';
        for(i = 1; i < 50; i++)
        {
            c = s_GetCharAndAdvance(*pp);
            if(c < 0)
                break;
            count++;
            ch = (Char) c;
            if(ch == '\n')
                line++;
            s[i] = ch;
            if(ch == '<' || ch == '>')
                break;
        }
        if(c < 0)
            break;
        if(ch == '<')
            continue;
        s[++i] = '\0';                  /* i is now the length of the tag */
        if(StringCmp(s, INSDSEQ_START) == 0)
        {
            /* a start tag inside an open record is ignored */
            if (within)
                continue;

            within = true;
            /* append a new node to the temporary list of records */
            if(ibnp == NULL)
            {
                ibnp = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
                tibnp = ibnp;
            }
            else
            {
                tibnp->next = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
                tibnp = tibnp->next;
            }
            tibnp->next = NULL;
            tibnp->ibp = new Indexblk;
            ibp = tibnp->ibp;
            ibp->xip = NULL;
            /* count already includes the start tag, so step back over it */
            ibp->offset = count - start_len;
            ibp->linenum = line;
            ibp->len = 0;
            pp->indx++;
            continue;
        }
        if (!within)
        {
            /* outside a record only a stray end tag is worth reporting */
            if(StringCmp(s, INSDSEQ_END) == 0)
                ErrPostEx(SEV_ERROR, ERR_FORMAT_UnexpectedEnd,
                          "Unexpected end tag \"%s\" of XML record found at line %d.",
                          s, line);
            continue;
        }
        if(StringCmp(s, INSDSEQ_END) == 0)
        {
            ibp->len = count - ibp->offset;
            within = false;
            continue;
        }
        /* Compare the tag name, without '<' or "</", against the table
         * entries without their '<'; unknown tags are skipped.
         */
        p = s + ((s[1] == '/') ? 2 : 1);
        for(xkbp = xmkwl; xkbp->str != NULL; xkbp++)
            if(StringCmp(p, xkbp->str + 1) == 0)
                break;
        if(xkbp->str == NULL)
            continue;
        /* First tag of the record, or a tag different from the one in the
         * current node: start a new node.
         */
        if(ibp->xip == NULL || xip->tag != xkbp->tag)
        {
            if(ibp->xip == NULL)
            {
                ibp->xip = XMLIndexNew();
                xip = ibp->xip;
            }
            else
            {
                xip->next = XMLIndexNew();
                xip = xip->next;
            }
            xip->tag = xkbp->tag;
            xip->order = xkbp->order;
            if(s[1] == '/')
            {
                /* offset of the '<' of the closing tag, record-relative */
                xip->end = count - i - ibp->offset;
                xip->end_line = line;
            }
            else
            {
                /* offset just past the '>' of the opening tag */
                xip->start = count - ibp->offset;
                xip->start_line = line;
            }
            continue;
        }
        /* Same tag as the current node: fill in the missing half, or open
         * a further node when that half is already set.
         */
        if(s[1] == '/')
        {
            if(xip->end != 0)
            {
                xip->next = XMLIndexNew();
                xip = xip->next;
                xip->tag = xkbp->tag;
                xip->order = xkbp->order;
            }
            xip->end = count - i - ibp->offset;
            xip->end_line = line;
        }
        else
        {
            if(xip->start != 0)
            {
                xip->next = XMLIndexNew();
                xip = xip->next;
                xip->tag = xkbp->tag;
                xip->order = xkbp->order;
            }
            xip->start = count - ibp->offset;
            xip->start_line = line;
        }
    }

    /* Move the records from the temporary list into a NULL-terminated
     * array, freeing the list nodes as they are emptied.
     */
    pp->entrylist = (IndexblkPtr*) MemNew((pp->indx + 1) *
                                              sizeof(IndexblkPtr));
    for(tibnp = ibnp, i = 0; tibnp != NULL; i++, tibnp = ibnp)
    {
        pp->entrylist[i] = tibnp->ibp;
        ibnp = tibnp->next;
        MemFree(tibnp);
    }
    pp->entrylist[i] = NULL;
}
518 
519 /**********************************************************/
XMLParseVersion(IndexblkPtr ibp,char * line)520 static void XMLParseVersion(IndexblkPtr ibp, char* line)
521 {
522     char* p;
523     char* q;
524 
525     if(line == NULL)
526     {
527         ErrPostEx(SEV_FATAL, ERR_VERSION_BadVersionLine,
528                   "Empty <INSDSeq_accession-version> line. Entry dropped.");
529         ibp->drop = 1;
530         return;
531     }
532 
533     for(p = line; *p != '\0' && *p != ' ' && *p != '\t';)
534         p++;
535     if(*p != '\0')
536     {
537         ErrPostEx(SEV_FATAL, ERR_VERSION_BadVersionLine,
538                   "Incorrect <INSDSeq_accession-version> line: \"%s\". Entry dropped.",
539                   line);
540         ibp->drop = 1;
541         return;
542     }
543     q = StringRChr(line, '.');
544     if(q == NULL)
545     {
546         ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum,
547                   "Missing version number in <INSDSeq_accession-version> line: \"%s\". Entry dropped.",
548                   line);
549         ibp->drop = 1;
550         return;
551     }
552     for(p = q + 1; *p >= '0' && *p <= '9';)
553         p++;
554     if(*p != '\0')
555     {
556         ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum,
557                   "Incorrect VERSION number in <INSDSeq_accession-version> line: \"%s\". Entry dropped.",
558                   line);
559         ibp->drop = 1;
560         return;
561     }
562     *q = '\0';
563     if(ibp->acnum == NULL || StringCmp(ibp->acnum, line) != 0)
564     {
565         *q = '.';
566         ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch,
567                   "Accessions in <INSDSeq_accession-version> and <INSDSeq_primary-accession> lines don't match: \"%s\" vs \"%s\". Entry dropped.",
568                   line, (ibp->acnum == NULL) ? "NULL" : ibp->acnum);
569         ibp->drop = 1;
570         return;
571     }
572     *q++ = '.';
573     ibp->vernum = atoi(q);
574 
575     if(ibp->vernum > 0)
576         return;
577 
578     ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion,
579               "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer. Entry dropped.",
580               ibp->vernum, ibp->acnum, ibp->vernum);
581     ibp->drop = 1;
582 }
583 
584 /**********************************************************/
XMLInitialEntry(IndexblkPtr ibp,char * entry,bool accver,Parser::ESource source)585 static void XMLInitialEntry(IndexblkPtr ibp, char* entry, bool accver,
586                             Parser::ESource source)
587 {
588     XmlIndexPtr xip;
589     char*     buf;
590 
591     if(ibp == NULL || ibp->xip == NULL || entry == NULL)
592         return;
593     xip = ibp->xip;
594 
595     if(source == Parser::ESource::USPTO)
596         ibp->is_pat = true;
597 
598     ibp->locusname[0] = '\0';
599     ibp->acnum[0] = '\0';
600     for(xip = ibp->xip; xip != NULL; xip = xip->next)
601     {
602         if(xip->tag == INSDSEQ_LOCUS && ibp->locusname[0] == '\0')
603         {
604             if(xip->start == 0 || xip->end == 0 || xip->start >= xip->end ||
605                source == Parser::ESource::USPTO)
606             {
607                 StringCpy(ibp->locusname, "???");
608                 StringCpy(ibp->blocusname, "???");
609                 continue;
610             }
611             size_t imax = xip->end - xip->start;
612             if(imax > (int) sizeof(ibp->locusname) - 1)
613                 imax = sizeof(ibp->locusname) - 1;
614             StringNCpy(ibp->locusname, entry + xip->start, imax);
615             ibp->locusname[imax] = '\0';
616             StringCpy(ibp->blocusname, ibp->locusname);
617         }
618         else if(xip->tag == INSDSEQ_PRIMARY_ACCESSION && ibp->acnum[0] == '\0')
619         {
620             if(xip->start == 0 || xip->end == 0 || xip->start >= xip->end)
621             {
622                 StringCpy(ibp->acnum, "???");
623                 continue;
624             }
625             size_t imax = xip->end - xip->start;
626             if(imax > (int) sizeof(ibp->acnum) - 1)
627                 imax = sizeof(ibp->acnum) - 1;
628             StringNCpy(ibp->acnum, entry + xip->start, imax);
629             ibp->acnum[imax] = '\0';
630         }
631         if(ibp->locusname[0] != '\0' && ibp->acnum[0] != '\0')
632             break;
633     }
634 
635     FtaInstallPrefix(PREFIX_LOCUS, ibp->locusname, NULL);
636     if(ibp->acnum[0] == '\0')
637         FtaInstallPrefix(PREFIX_ACCESSION, ibp->locusname, NULL);
638     else
639         FtaInstallPrefix(PREFIX_ACCESSION, ibp->acnum, NULL);
640 
641     if(accver)
642     {
643         for(xip = ibp->xip; xip != NULL; xip = xip->next)
644         {
645             if(xip->tag != INSDSEQ_ACCESSION_VERSION)
646                 continue;
647             buf = XMLGetTagValue(entry, xip);
648             XMLParseVersion(ibp, buf);
649             if(buf != NULL)
650             {
651                 FtaInstallPrefix(PREFIX_ACCESSION, buf, NULL);
652                 MemFree(buf);
653             }
654             break;
655         }
656     }
657 
658     ibp->bases = 0;
659     ibp->date = NULL;
660     StringCpy(ibp->division, "???");
661     for(xip = ibp->xip; xip != NULL; xip = xip->next)
662     {
663         if(xip->tag == INSDSEQ_LENGTH && ibp->bases == 0)
664         {
665             buf = XMLGetTagValue(entry, xip);
666             if(buf == NULL)
667                 continue;
668             ibp->bases = (size_t) atoi(buf);
669             MemFree(buf);
670         }
671         else if(xip->tag == INSDSEQ_UPDATE_DATE && ibp->date == NULL)
672         {
673             buf = XMLGetTagValue(entry, xip);
674             if(buf == NULL)
675                 continue;
676             ibp->date = GetUpdateDate(buf, source);
677             MemFree(buf);
678         }
679         else if(xip->tag == INSDSEQ_DIVISION && ibp->division[0] == '?')
680         {
681             if(xip->start == 0 || xip->end == 0 || xip->start >= xip->end ||
682                xip->end - xip->start < 3)
683                 continue;
684             StringNCpy(ibp->division, entry + xip->start, 3);
685             ibp->division[3] = '\0';
686             if(StringICmp(ibp->division, "EST") == 0)
687                 ibp->EST = true;
688             else if(StringCmp(ibp->division, "STS") == 0)
689                 ibp->STS = true;
690             else if(StringCmp(ibp->division, "GSS") == 0)
691                 ibp->GSS = true;
692             else if(StringCmp(ibp->division, "HTC") == 0)
693                 ibp->HTC = true;
694         }
695         if(ibp->bases > 0 && ibp->date != NULL && ibp->division[0] != '?')
696             break;
697     }
698 }
699 
700 /**********************************************************/
XMLStringByTag(XmlKwordBlkPtr xkbp,Int4 tag)701 static const char *XMLStringByTag(XmlKwordBlkPtr xkbp, Int4 tag)
702 {
703     for(; xkbp->str != NULL; xkbp++)
704         if(xkbp->tag == tag)
705             break;
706     if(xkbp->str == NULL)
707         return("???");
708     return(xkbp->str);
709 }
710 
711 /**********************************************************/
XMLTagCheck(XmlIndexPtr xip,XmlKwordBlkPtr xkbp)712 static bool XMLTagCheck(XmlIndexPtr xip, XmlKwordBlkPtr xkbp)
713 {
714     XmlIndexPtr txip;
715     bool ret = true;
716     for(txip = xip; txip != NULL; txip = txip->next)
717     {
718         if(txip->start == 0)
719         {
720             ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingStartTag,
721                       "XML record's missing start tag for \"%s\" at line %d.",
722                       XMLStringByTag(xkbp, txip->tag), txip->end_line);
723             ret = false;
724         }
725         if(txip->end == 0)
726         {
727             ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingEndTag,
728                       "XML record's missing end tag for \"%s\" at line %d.",
729                       XMLStringByTag(xkbp, txip->tag), txip->start_line);
730             ret = false;
731         }
732         if(txip->next != NULL && txip->order >= txip->next->order)
733         {
734             ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
735                       "XML tag \"%s\" at line %d is out of order.",
736                       XMLStringByTag(xkbp, txip->next->tag),
737                       (txip->next->start > 0) ? txip->next->start_line :
738                                                 txip->next->end_line);
739             ret = false;
740         }
741     }
742     return(ret);
743 }
744 
745 /**********************************************************/
XMLSameTagsCheck(XmlIndexPtr xip,char * name)746 static bool XMLSameTagsCheck(XmlIndexPtr xip, char* name)
747 {
748     bool ret = true;
749 
750     for (XmlIndexPtr txip = xip; txip != NULL; txip = txip->next)
751     {
752         if(txip->start == 0)
753         {
754             ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingStartTag,
755                       "XML record's missing start tag for \"%s\" at line %d.",
756                       name, txip->end_line);
757             ret = false;
758         }
759         if(txip->end == 0)
760         {
761             ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingEndTag,
762                       "XML record's missing end tag for \"%s\" at line %d.",
763                       name, txip->start_line);
764             ret = false;
765         }
766     }
767     return(ret);
768 }
769 
770 /**********************************************************/
/*
 * XMLIndexSameSubTags: index the repeated sub-elements of one tag.
 *
 *   entry  text of the record
 *   xip    index node of the enclosing tag; the scan covers its
 *          start..end range
 *   tag    id of the sub-element, resolved to its text through xmsubkwl
 *
 * Returns a list with one XmlIndex node per occurrence, whose start/end are
 * offsets into entry of the text between the opening and closing tag.
 * Returns NULL when entry or xip is NULL, when no occurrence was found, or
 * when XMLSameTagsCheck() rejects the list (the list is freed then).
 */
static XmlIndexPtr XMLIndexSameSubTags(char* entry, XmlIndexPtr xip,
                                       Int4 tag)
{
    XmlIndexPtr xipsub;
    XmlIndexPtr txipsub;
    char*     name;
    char*     c;
    char*     p;
    size_t      count;
    Char        s[60];
    Int4        line;
    Int4        i;

    if(entry == NULL || xip == NULL)
        return(NULL);

    /* XMLStringByTag() yields "???" for an unknown tag, which then simply
     * never matches below */
    name = (char*) XMLStringByTag(xmsubkwl, tag);
    if(name == NULL)
        return(NULL);

    s[0] = '\0';
    xipsub = NULL;
    txipsub = NULL;
    line = xip->start_line;
    c = entry + xip->start;
    /* count stays one ahead of the offset of *c, i.e. it is the offset of
     * the character following the one c points at */
    for(count = xip->start + 1;;)
    {
        /* advance to the next '<' inside the enclosing tag's range */
        if(*c != '<')
        {
            c++;
            count++;
            if(*c == '\0' || count > xip->end)
                break;
            if(*c == '\n')
                line++;
        }
        if(*c != '<')
            continue;

        /* Collect the tag text, stopping at '>', at another '<' or after
         * 49 characters.
         * NOTE(review): when the loop runs out at i == 50, s[50] is never
         * written before s[51] gets the terminator below.
         */
        for(s[0] = '<', i = 1; i < 50; i++)
        {
            c++;
            count++;
            if(*c == '\0' || count > xip->end)
                break;
            if(*c == '\n')
                line++;
            s[i] = *c;
            if(*c == '<' || *c == '>')
                break;
        }
        if(*c == '\0' || count > xip->end)
            break;
        if(*c == '<')
            continue;
        s[++i] = '\0';                  /* i is now the length of the tag */
        /* compare names without '<' / "</"; skip every other tag */
        p = s + ((s[1] == '/') ? 2 : 1);
        if(StringCmp(p, name + 1) != 0)
            continue;

        /* Reuse the current node while the half being set is still empty,
         * otherwise append a new node.
         */
        if(xipsub == NULL)
        {
            xipsub = XMLIndexNew();
            txipsub = xipsub;
        }
        else if((s[1] != '/' && txipsub->start != 0) ||
                (s[1] == '/' && txipsub->end != 0))
        {
            txipsub->next = XMLIndexNew();
            txipsub = txipsub->next;
        }
        if(s[1] == '/')
        {
            /* offset of the '<' of the closing tag */
            txipsub->end = count - i;
            txipsub->end_line = line;
        }
        else
        {
            /* offset just past the '>' of the opening tag */
            txipsub->start = count;
            txipsub->start_line = line;
        }
        txipsub->tag = tag;
    }

    if(XMLSameTagsCheck(xipsub, name))
        return(xipsub);

    XMLIndexFree(xipsub);
    return(NULL);
}
861 
862 /**********************************************************/
XMLAccessionsCheck(ParserPtr pp,IndexblkPtr ibp,char * entry)863 static bool XMLAccessionsCheck(ParserPtr pp, IndexblkPtr ibp, char* entry)
864 {
865     XmlIndexPtr xip;
866     XmlIndexPtr xipsec;
867     char*     buf;
868     char*     p;
869 
870     bool ret = true;
871     size_t len = StringLen(ibp->acnum) + StringLen(XML_FAKE_ACC_TAG) + 1;
872 
873     for(xip = ibp->xip; xip != NULL; xip = xip->next)
874         if(xip->tag == INSDSEQ_SECONDARY_ACCESSIONS)
875             break;
876 
877     if(xip == NULL)
878     {
879         buf = (char*) MemNew(len);
880         StringCpy(buf, XML_FAKE_ACC_TAG);
881         StringCat(buf, ibp->acnum);
882         ret = GetAccession(pp, buf, ibp, 2);
883         MemFree(buf);
884         return(ret);
885     }
886 
887     xip->subtags = XMLIndexSameSubTags(entry, xip, INSDSECONDARY_ACCN);
888     if(xip->subtags == NULL)
889     {
890         p = (char*) XMLStringByTag(xmkwl, INSDSEQ_SECONDARY_ACCESSIONS);
891         ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
892                   "Incorrectly formatted \"%s\" XML block. Entry dropped.", p);
893         ibp->drop = 1;
894         return false;
895     }
896 
897     for(xipsec = xip->subtags; xipsec != NULL; xipsec = xipsec->next)
898         len += (xipsec->end - xipsec->start + 1);
899 
900     buf = (char*) MemNew(len);
901     StringCpy(buf, XML_FAKE_ACC_TAG);
902     StringCat(buf, ibp->acnum);
903     for(xipsec = xip->subtags; xipsec != NULL; xipsec = xipsec->next)
904     {
905         p = XMLGetTagValue(entry, xipsec);
906         if(p == NULL)
907             continue;
908         StringCat(buf, " ");
909         StringCat(buf, p);
910         MemFree(p);
911     }
912     ret = GetAccession(pp, buf, ibp, 2);
913     MemFree(buf);
914     return(ret);
915 }
916 
917 /**********************************************************/
XMLKeywordsCheck(char * entry,IndexblkPtr ibp,Parser::ESource source)918 static bool XMLKeywordsCheck(char* entry, IndexblkPtr ibp, Parser::ESource source)
919 {
920     XmlIndexPtr xip;
921     XmlIndexPtr xipkwd;
922     ValNodePtr  vnp;
923     char*     buf;
924     char*     p;
925 
926     bool tpa_check = (source == Parser::ESource::EMBL);
927 
928     if(entry == NULL || ibp == NULL || ibp->xip == NULL)
929         return true;
930 
931     for(xip = ibp->xip; xip != NULL; xip = xip->next)
932         if(xip->tag == INSDSEQ_KEYWORDS)
933             break;
934     if(xip == NULL)
935         return true;
936 
937     xip->subtags = XMLIndexSameSubTags(entry, xip, INSDKEYWORD);
938     if(xip->subtags == NULL)
939     {
940         p = (char*) XMLStringByTag(xmkwl, INSDSEQ_KEYWORDS);
941         ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
942                   "Incorrectly formatted \"%s\" XML block. Entry dropped.", p);
943         ibp->drop = 1;
944         return false;
945     }
946 
947     size_t len = 0;
948     for(xipkwd = xip->subtags; xipkwd != NULL; xipkwd = xipkwd->next)
949         len += (xipkwd->end - xipkwd->start + 2);
950 
951     buf = (char*) MemNew(len);
952     *buf = '\0';
953     for(xipkwd = xip->subtags; xipkwd != NULL; xipkwd = xipkwd->next)
954     {
955         p = XMLGetTagValue(entry, xipkwd);
956         if(p == NULL)
957             continue;
958         if(*buf != '\0')
959             StringCat(buf, "; ");
960         StringCat(buf, p);
961         MemFree(p);
962     }
963 
964     vnp = ConstructValNode(NULL, 0, buf);
965     check_est_sts_gss_tpa_kwds(vnp, len, ibp, tpa_check, ibp->specialist_db,
966                                ibp->inferential, ibp->experimental,
967                                ibp->assembly);
968     MemFree(buf);
969     MemFree(vnp);
970     return true;
971 }
972 
973 /**********************************************************/
XMLErrField(Int4 tag)974 static bool XMLErrField(Int4 tag)
975 {
976     ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
977               "No %s data in XML format file. Entry dropped.",
978               XMLStringByTag(xmkwl, tag));
979     return false;
980 }
981 
982 /**********************************************************/
XMLCheckRequiredTags(ParserPtr pp,IndexblkPtr ibp)983 static bool XMLCheckRequiredTags(ParserPtr pp, IndexblkPtr ibp)
984 {
985     XmlIndexPtr xip;
986     bool     got_locus = false;
987     bool     got_length = false;
988     bool     got_moltype = false;
989     bool     got_division = false;
990     bool     got_update_date = false;
991     bool     got_definition = false;
992     bool     got_accession = false;
993     bool     got_version = false;
994     bool     got_source = false;
995     bool     got_organism = false;
996     bool     got_reference = false;
997     bool     got_primary = false;
998     bool     got_features = false;
999     bool     ret = true;
1000 
1001     ibp->origin = false;
1002     ibp->is_contig = false;
1003     for(xip = ibp->xip; xip != NULL; xip = xip->next)
1004     {
1005         if(xip->tag == INSDSEQ_LOCUS && pp->source != Parser::ESource::USPTO)
1006             got_locus = true;
1007         else if(xip->tag == INSDSEQ_LENGTH)
1008             got_length = true;
1009         else if(xip->tag == INSDSEQ_MOLTYPE)
1010             got_moltype = true;
1011         else if(xip->tag == INSDSEQ_DIVISION)
1012             got_division = true;
1013         else if(xip->tag == INSDSEQ_UPDATE_DATE)
1014             got_update_date = true;
1015         else if(xip->tag == INSDSEQ_DEFINITION)
1016             got_definition = true;
1017         else if(xip->tag == INSDSEQ_PRIMARY_ACCESSION)
1018             got_accession = true;
1019         else if(xip->tag == INSDSEQ_ACCESSION_VERSION)
1020             got_version = true;
1021         else if(xip->tag == INSDSEQ_SOURCE)
1022             got_source = true;
1023         else if(xip->tag == INSDSEQ_ORGANISM)
1024             got_organism = true;
1025         else if(xip->tag == INSDSEQ_REFERENCES)
1026             got_reference = true;
1027         else if(xip->tag == INSDSEQ_PRIMARY)
1028             got_primary = true;
1029         else if(xip->tag == INSDSEQ_FEATURE_TABLE)
1030             got_features = true;
1031         else if(xip->tag == INSDSEQ_CONTIG)
1032             ibp->is_contig = true;
1033         else if(xip->tag == INSDSEQ_SEQUENCE)
1034             ibp->origin = true;
1035     }
1036 
1037     if(got_locus == false && pp->source != Parser::ESource::USPTO)
1038         ret = XMLErrField(INSDSEQ_LOCUS);
1039     if(got_length == false)
1040         ret = XMLErrField(INSDSEQ_LENGTH);
1041     if(got_moltype == false)
1042         ret = XMLErrField(INSDSEQ_MOLTYPE);
1043     if(got_division == false)
1044         ret = XMLErrField(INSDSEQ_DIVISION);
1045     if(got_update_date == false && pp->source != Parser::ESource::USPTO)
1046         ret = XMLErrField(INSDSEQ_UPDATE_DATE);
1047     if(got_definition == false)
1048         ret = XMLErrField(INSDSEQ_DEFINITION);
1049     if(got_accession == false)
1050     {
1051         ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
1052                   "No accession number for this record. Entry dropped.");
1053         ret = false;
1054     }
1055     if(got_version == false)
1056     {
1057         if(pp->accver != false)
1058             ret = XMLErrField(INSDSEQ_ACCESSION_VERSION);
1059     }
1060     else if(pp->source == Parser::ESource::USPTO)
1061     {
1062         ErrPostEx(SEV_REJECT, ERR_ENTRY_InvalidLineType,
1063                   "Line type %s is not allowed for USPTO records. Entry dropped.",
1064                   XMLStringByTag(xmkwl, INSDSEQ_PRIMARY));
1065         ret = false;
1066     }
1067     if(got_source == false)
1068         ret = XMLErrField(INSDSEQ_SOURCE);
1069     if(got_organism == false)
1070         ret = XMLErrField(INSDSEQ_ORGANISM);
1071     if(got_reference == false && pp->source != Parser::ESource::Flybase &&
1072        ibp->is_wgs == false &&
1073        (pp->source != Parser::ESource::Refseq ||
1074         StringNCmp(ibp->acnum, "NW_", 3) != 0))
1075         ret = XMLErrField(INSDSEQ_REFERENCES);
1076     if (got_primary && ibp->is_tpa == false && ibp->tsa_allowed == false)
1077     {
1078         ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
1079                   "Line type %s is allowed for TPA or TSA records only. Continue anyway.",
1080                   XMLStringByTag(xmkwl, INSDSEQ_PRIMARY));
1081     }
1082     if(got_features == false)
1083         ret = XMLErrField(INSDSEQ_FEATURE_TABLE);
1084     if(ibp->is_contig && ibp->segnum != 0)
1085     {
1086         ErrPostEx(SEV_ERROR, ERR_FORMAT_ContigInSegset,
1087                   "%s data are not allowed for members of segmented sets. Entry dropped.",
1088                   XMLStringByTag(xmkwl, INSDSEQ_CONTIG));
1089         ret = false;
1090     }
1091 
1092     ibp->is_tpa_wgs_con = (ibp->is_contig && ibp->is_wgs && ibp->is_tpa);
1093 
1094     return(ret);
1095 }
1096 
1097 /**********************************************************/
XMLLoadEntry(ParserPtr pp,bool err)1098 char* XMLLoadEntry(ParserPtr pp, bool err)
1099 {
1100     IndexblkPtr ibp;
1101     char*     entry;
1102     char*     p;
1103     size_t      i;
1104     Int4        c;
1105 
1106     if (!pp || !s_HasInput(*pp)) {
1107         return nullptr;
1108     }
1109 
1110     ibp = pp->entrylist[pp->curindx];
1111     if(ibp == NULL || ibp->len == 0)
1112         return(NULL);
1113 
1114     entry = (char*) MemNew(ibp->len + 1);
1115     s_SetPointer(*pp, ibp->offset);
1116 
1117 
1118     for(p = entry, i = 0; i < ibp->len; i++)
1119     {
1120         c = s_GetCharAndAdvance(*pp);
1121         if(c < 0)
1122             break;
1123         if (c == 13) {
1124             c = 10;
1125         }
1126         if(c > 126 || (c < 32 && c != 10))
1127         {
1128             if (err)
1129                 ErrPostEx(SEV_WARNING, ERR_FORMAT_NonAsciiChar,
1130                           "None-ASCII character within the record which begins at line %d, decimal value %d, replaced by #.",
1131                           ibp->linenum, c);
1132             *p++ = '#';
1133         }
1134         else
1135             *p++ = (Char) c;
1136     }
1137     if(i != ibp->len)
1138     {
1139         MemFree(entry);
1140         return(NULL);
1141     }
1142     *p = '\0';
1143 
1144     return(entry);
1145 }
1146 
1147 
1148 /**********************************************************/
XMLIndexSubTags(char * entry,XmlIndexPtr xip,XmlKwordBlkPtr xkbp)1149 static bool XMLIndexSubTags(char* entry, XmlIndexPtr xip, XmlKwordBlkPtr xkbp)
1150 {
1151     XmlKwordBlkPtr txkbp;
1152     XmlIndexPtr    xipsub;
1153     char*        c;
1154     char*        p;
1155     Char           s[60];
1156     size_t         count;
1157     Int4           line;
1158     Int4           i;
1159 
1160     if(entry == NULL || xip == NULL)
1161         return false;
1162 
1163     s[0] = '\0';
1164     xipsub = NULL;
1165     line = xip->start_line;
1166     c = entry + xip->start;
1167     for(count = xip->start + 1;;)
1168     {
1169         if(*c != '<')
1170         {
1171             c++;
1172             count++;
1173             if(*c == '\0' || count > xip->end)
1174                 break;
1175             if(*c == '\n')
1176                 line++;
1177         }
1178         if(*c != '<')
1179             continue;
1180 
1181         for(s[0] = '<', i = 1; i < 50; i++)
1182         {
1183             c++;
1184             count++;
1185             if(*c == '\0' || count > xip->end)
1186                 break;
1187             if(*c == '\n')
1188                 line++;
1189             s[i] = *c;
1190             if(*c == '<' || *c == '>')
1191                 break;
1192         }
1193         if(*c == '\0' || count > xip->end)
1194             break;
1195         if(*c == '<')
1196             continue;
1197         s[++i] = '\0';
1198         p = s + ((s[1] == '/') ? 2 : 1);
1199         for(txkbp = xkbp; txkbp->str != NULL; txkbp++)
1200             if(StringCmp(p, txkbp->str + 1) == 0)
1201                 break;
1202         if(txkbp->str == NULL)
1203             continue;
1204         if(xipsub == NULL || xipsub->tag != txkbp->tag)
1205         {
1206             if(xipsub == NULL)
1207             {
1208                 xipsub = XMLIndexNew();
1209                 xip->subtags = xipsub;
1210             }
1211             else
1212             {
1213                 xipsub->next = XMLIndexNew();
1214                 xipsub = xipsub->next;
1215             }
1216             xipsub->tag = txkbp->tag;
1217             xipsub->order = txkbp->order;
1218             if(s[1] == '/')
1219             {
1220                 xipsub->end = count - i;
1221                 xipsub->end_line = line;
1222             }
1223             else
1224             {
1225                 xipsub->start = count;
1226                 xipsub->start_line = line;
1227             }
1228             continue;
1229         }
1230         if(s[1] == '/')
1231         {
1232             if(xipsub->end != 0)
1233             {
1234                 xipsub->next = XMLIndexNew();
1235                 xipsub = xipsub->next;
1236                 xipsub->tag = txkbp->tag;
1237                 xipsub->order = txkbp->order;
1238             }
1239             xipsub->end = count - i;
1240             xipsub->end_line = line;
1241         }
1242         else
1243         {
1244             if(xipsub->start != 0)
1245             {
1246                 xipsub->next = XMLIndexNew();
1247                 xipsub = xipsub->next;
1248                 xipsub->tag = txkbp->tag;
1249                 xipsub->order = txkbp->order;
1250             }
1251             xipsub->start = count;
1252             xipsub->start_line = line;
1253         }
1254     }
1255 
1256     if (!XMLTagCheck(xip->subtags, xkbp))
1257         return false;
1258 
1259     return true;
1260 }
1261 
1262 /**********************************************************/
XMLCheckRequiredFeatTags(XmlIndexPtr xip)1263 static bool XMLCheckRequiredFeatTags(XmlIndexPtr xip)
1264 {
1265     bool got_key = false;
1266     bool got_location = false;
1267     bool ret = true;
1268 
1269     for(; xip != NULL; xip = xip->next)
1270     {
1271         if(xip->tag == INSDFEATURE_KEY)
1272             got_key = true;
1273         else if(xip->tag == INSDFEATURE_LOCATION)
1274             got_location = true;
1275     }
1276 
1277     if(!got_key)
1278     {
1279         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1280                   "Feature table is missing %s data in XML format file.",
1281                   XMLStringByTag(xmfeatkwl, INSDFEATURE_KEY));
1282         ret = false;
1283     }
1284 
1285     if(!got_location)
1286     {
1287         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1288                   "Feature table is missing %s data in XML format file.",
1289                   XMLStringByTag(xmfeatkwl, INSDFEATURE_LOCATION));
1290         ret = false;
1291     }
1292     return(ret);
1293 }
1294 
1295 /**********************************************************/
XMLCheckRequiredIntTags(XmlIndexPtr xip)1296 static bool XMLCheckRequiredIntTags(XmlIndexPtr xip)
1297 {
1298     bool got_from = false;
1299     bool got_to = false;
1300     bool got_point = false;
1301     bool got_accession = false;
1302     bool ret = true;
1303 
1304     for(; xip != NULL; xip = xip->next)
1305     {
1306         if(xip->tag == INSDINTERVAL_FROM)
1307             got_from = true;
1308         else if(xip->tag == INSDINTERVAL_TO)
1309             got_to = true;
1310         else if(xip->tag == INSDINTERVAL_POINT)
1311             got_point = true;
1312         else if(xip->tag == INSDINTERVAL_ACCESSION)
1313             got_accession = true;
1314     }
1315 
1316     if(!got_accession)
1317     {
1318         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1319                   "Feature's interval block is missing %s data in XML format file.",
1320                   XMLStringByTag(xmintkwl, INSDINTERVAL_ACCESSION));
1321         ret = false;
1322     }
1323 
1324     if(got_point)
1325     {
1326         if(got_from || got_to)
1327         {
1328             ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLInvalidINSDInterval,
1329                       "%s tag cannot co-exist with %s or %s or both in XML format.",
1330                       XMLStringByTag(xmintkwl, INSDINTERVAL_POINT),
1331                       XMLStringByTag(xmintkwl, INSDINTERVAL_FROM),
1332                       XMLStringByTag(xmintkwl, INSDINTERVAL_TO));
1333             ret = false;
1334         }
1335     }
1336     else if(got_from == false || got_to == false)
1337     {
1338         ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLInvalidINSDInterval,
1339                   "%s must contain either both of %s and %s, or %s.",
1340                   XMLStringByTag(xmsubkwl, INSDINTERVAL),
1341                   XMLStringByTag(xmintkwl, INSDINTERVAL_FROM),
1342                   XMLStringByTag(xmintkwl, INSDINTERVAL_TO),
1343                   XMLStringByTag(xmintkwl, INSDINTERVAL_POINT));
1344         ret = false;
1345     }
1346 
1347     return(ret);
1348 }
1349 
1350 /**********************************************************/
XMLCheckRequiredQualTags(XmlIndexPtr xip)1351 static bool XMLCheckRequiredQualTags(XmlIndexPtr xip)
1352 {
1353     for (; xip != NULL; xip = xip->next)
1354     {
1355         if (xip->tag == INSDQUALIFIER_NAME)
1356             break;
1357     }
1358 
1359     if(xip != NULL)
1360         return true;
1361 
1362     ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1363               "Qualifier block is missing %s data in XML format file.",
1364               XMLStringByTag(xmqualkwl, INSDQUALIFIER_NAME));
1365     return false;
1366 }
1367 
1368 /**********************************************************/
XMLIndexFeatures(char * entry,XmlIndexPtr xip)1369 static bool XMLIndexFeatures(char* entry, XmlIndexPtr xip)
1370 {
1371     XmlIndexPtr xipfeat;
1372     XmlIndexPtr xipsub;
1373     XmlIndexPtr txip;
1374 
1375     if(xip == NULL || entry == NULL)
1376         return true;
1377 
1378     for (; xip != NULL; xip = xip->next)
1379     {
1380         if (xip->tag == INSDSEQ_FEATURE_TABLE)
1381             break;
1382     }
1383 
1384     if(xip == NULL)
1385         return true;
1386 
1387     xip->subtags = XMLIndexSameSubTags(entry, xip, INSDFEATURE);
1388     if(xip->subtags == NULL)
1389     {
1390         ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1391                   "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1392                   XMLStringByTag(xmkwl, INSDSEQ_FEATURE_TABLE));
1393         return false;
1394     }
1395 
1396     for(xipfeat = xip->subtags; xipfeat != NULL; xipfeat = xipfeat->next)
1397     {
1398         if(XMLIndexSubTags(entry, xipfeat, xmfeatkwl) == false ||
1399            XMLCheckRequiredFeatTags(xipfeat->subtags) == false)
1400             break;
1401         for(txip = xipfeat->subtags; txip != NULL; txip = txip->next)
1402         {
1403             if(txip->tag == INSDFEATURE_INTERVALS)
1404             {
1405                 txip->subtags = XMLIndexSameSubTags(entry, txip, INSDINTERVAL);
1406                 if(txip->subtags == NULL)
1407                     break;
1408                 xipsub = txip->subtags;
1409                 for(; xipsub != NULL; xipsub = xipsub->next)
1410                     if(XMLIndexSubTags(entry, xipsub, xmintkwl) == false ||
1411                        XMLCheckRequiredIntTags(xipsub->subtags) == false)
1412                         break;
1413             }
1414             else if(txip->tag == INSDFEATURE_QUALS)
1415             {
1416                 txip->subtags = XMLIndexSameSubTags(entry, txip,
1417                                                     INSDQUALIFIER);
1418                 if(txip->subtags == NULL)
1419                     break;
1420                 xipsub = txip->subtags;
1421                 for(; xipsub != NULL; xipsub = xipsub->next)
1422                     if(XMLIndexSubTags(entry, xipsub, xmqualkwl) == false ||
1423                        XMLCheckRequiredQualTags(xipsub->subtags) == false)
1424                         break;
1425             }
1426         }
1427         if(txip != NULL)
1428             break;
1429     }
1430 
1431     if(xipfeat == NULL)
1432         return true;
1433 
1434     ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1435               "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1436               XMLStringByTag(xmkwl, INSDSEQ_FEATURE_TABLE));
1437     return false;
1438 }
1439 
1440 /**********************************************************/
XMLCheckRequiredRefTags(XmlIndexPtr xip)1441 static bool XMLCheckRequiredRefTags(XmlIndexPtr xip)
1442 {
1443     bool got_reference = false;
1444     bool got_journal = false;
1445     bool ret = true;
1446 
1447     for(; xip != NULL; xip = xip->next)
1448     {
1449         if(xip->tag == INSDREFERENCE_REFERENCE)
1450             got_reference = true;
1451         else if(xip->tag == INSDREFERENCE_JOURNAL)
1452             got_journal = true;
1453     }
1454 
1455     if (!got_reference)
1456     {
1457         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1458                   "%s block is missing %s data in XML format file.",
1459                   XMLStringByTag(xmsubkwl, INSDREFERENCE),
1460                   XMLStringByTag(xmrefkwl, INSDREFERENCE_REFERENCE));
1461         ret = false;
1462     }
1463 
1464     if (!got_journal)
1465     {
1466         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1467                   "%s block is missing %s data in XML format file.",
1468                   XMLStringByTag(xmsubkwl, INSDREFERENCE),
1469                   XMLStringByTag(xmrefkwl, INSDREFERENCE_JOURNAL));
1470         ret = false;
1471     }
1472     return(ret);
1473 }
1474 
1475 /**********************************************************/
XMLGetRefTypePos(char * reftag,size_t bases)1476 static Int2 XMLGetRefTypePos(char* reftag, size_t bases)
1477 {
1478     Char str[100];
1479 
1480     if(reftag == NULL || *reftag == '\0')
1481         return(ParFlat_REF_NO_TARGET);
1482 
1483     sprintf(str, "1..%d", (int) bases);
1484 
1485     if(StringCmp(reftag, str) == 0)
1486         return(ParFlat_REF_END);
1487     if(StringCmp(reftag, "sites") == 0)
1488         return(ParFlat_REF_SITES);
1489     return(ParFlat_REF_BTW);
1490 }
1491 
1492 /**********************************************************/
XMLGetRefType(char * reftag,size_t bases)1493 static Int2 XMLGetRefType(char* reftag, size_t bases)
1494 {
1495     char* p;
1496     Char    str[100];
1497     Char    str1[100];
1498 
1499     if(reftag == NULL)
1500         return(ParFlat_REF_NO_TARGET);
1501 
1502     for(p = reftag; *p != '\0' && *p != '(';)
1503         p++;
1504     if(*p == '\0')
1505         return(ParFlat_REF_NO_TARGET);
1506 
1507     sprintf(str, "(bases 1 to %d)", (int) bases);
1508     sprintf(str1, "(bases 1 to %d;", (int) bases);
1509 
1510     if(StringStr(p, str) != NULL || StringStr(p, str1) != NULL)
1511         return(ParFlat_REF_END);
1512     if(StringStr(p, "(sites)") != NULL)
1513         return(ParFlat_REF_SITES);
1514     return(ParFlat_REF_BTW);
1515 }
1516 
1517 /**********************************************************/
XMLCheckRequiredXrefTags(XmlIndexPtr xip)1518 static bool XMLCheckRequiredXrefTags(XmlIndexPtr xip)
1519 {
1520     bool got_dbname = false;
1521     bool got_id = false;
1522     bool ret = true;
1523 
1524     for(; xip != NULL; xip = xip->next)
1525     {
1526         if(xip->tag == INSDXREF_DBNAME)
1527             got_dbname = true;
1528         else if(xip->tag == INSDXREF_ID)
1529             got_id = true;
1530     }
1531 
1532     if (!got_dbname)
1533     {
1534         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1535                   "%s block is missing %s data in XML format file.",
1536                   XMLStringByTag(xmsubkwl, INSDXREF),
1537                   XMLStringByTag(xmrefkwl, INSDXREF_DBNAME));
1538         ret = false;
1539     }
1540 
1541     if (!got_id)
1542     {
1543         ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1544                   "%s block is missing %s data in XML format file.",
1545                   XMLStringByTag(xmsubkwl, INSDXREF),
1546                   XMLStringByTag(xmrefkwl, INSDXREF_ID));
1547         ret = false;
1548     }
1549     return(ret);
1550 }
1551 
1552 /**********************************************************/
XMLIndexReferences(char * entry,XmlIndexPtr xip,size_t bases)1553 static bool XMLIndexReferences(char* entry, XmlIndexPtr xip, size_t bases)
1554 {
1555     XmlIndexPtr xipref;
1556     XmlIndexPtr txip;
1557     XmlIndexPtr xipsub;
1558     char*     reftagref;
1559     char*     reftagpos;
1560 
1561     if(xip == NULL || entry == NULL)
1562         return true;
1563 
1564     for (; xip != NULL; xip = xip->next)
1565     {
1566         if (xip->tag == INSDSEQ_REFERENCES)
1567             break;
1568     }
1569     if(xip == NULL)
1570         return true;
1571 
1572     xip->subtags = XMLIndexSameSubTags(entry, xip, INSDREFERENCE);
1573     if(xip->subtags == NULL)
1574     {
1575         ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1576                   "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1577                   XMLStringByTag(xmkwl, INSDSEQ_REFERENCES));
1578         return false;
1579     }
1580 
1581     for(xipref = xip->subtags; xipref != NULL; xipref = xipref->next)
1582     {
1583         if(XMLIndexSubTags(entry, xipref, xmrefkwl) == false ||
1584            XMLCheckRequiredRefTags(xipref->subtags) == false)
1585             break;
1586 
1587         reftagref = NULL;
1588         reftagpos = NULL;
1589         for(txip = xipref->subtags; txip != NULL; txip = txip->next)
1590         {
1591             if(txip->tag == INSDREFERENCE_REFERENCE)
1592             {
1593                 if(reftagref != NULL)
1594                     MemFree(reftagref);
1595                 reftagref = XMLGetTagValue(entry, txip);
1596                 continue;
1597             }
1598             if(txip->tag == INSDREFERENCE_POSITION)
1599             {
1600                 if(reftagpos != NULL)
1601                     MemFree(reftagpos);
1602                 reftagpos = XMLGetTagValue(entry, txip);
1603                 continue;
1604             }
1605             if(txip->tag == INSDREFERENCE_AUTHORS)
1606             {
1607                 txip->subtags = XMLIndexSameSubTags(entry, txip, INSDAUTHOR);
1608                 if(txip->subtags == NULL)
1609                     break;
1610             }
1611             else if(txip->tag == INSDREFERENCE_XREF)
1612             {
1613                 txip->subtags = XMLIndexSameSubTags(entry, txip, INSDXREF);
1614                 if(txip->subtags == NULL)
1615                     break;
1616                 xipsub = txip->subtags;
1617                 for(; xipsub != NULL; xipsub = xipsub->next)
1618                     if(XMLIndexSubTags(entry, xipsub, xmxrefkwl) == false ||
1619                        XMLCheckRequiredXrefTags(xipsub->subtags) == false)
1620                         break;
1621             }
1622         }
1623 
1624         if(reftagpos != NULL)
1625         {
1626             xipref->type = XMLGetRefTypePos(reftagpos, bases);
1627             MemFree(reftagpos);
1628         }
1629         else
1630             xipref->type = XMLGetRefType(reftagref, bases);
1631         if(reftagref != NULL)
1632             MemFree(reftagref);
1633 
1634         if(txip != NULL)
1635             break;
1636     }
1637 
1638     if(xipref == NULL)
1639         return true;
1640 
1641     ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1642               "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1643               XMLStringByTag(xmkwl, INSDSEQ_REFERENCES));
1644     return false;
1645 }
1646 
1647 /**********************************************************/
XMLIndex(ParserPtr pp)1648 bool XMLIndex(ParserPtr pp)
1649 {
1650     IndexblkPtr* ibpp;
1651     IndexblkPtr      ibp;
1652     char*          entry;
1653 
1654     XMLPerformIndex(pp);
1655 
1656     if(pp->indx == 0)
1657         return false;
1658 
1659     pp->curindx = 0;
1660     for(ibpp = pp->entrylist; *ibpp != NULL; ibpp++, pp->curindx++)
1661     {
1662         ibp = *ibpp;
1663         if(ibp->len == 0)
1664         {
1665             ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingEnd,
1666                       "Missing end tag of XML record, which starts at line %d. Entry dropped.",
1667                       ibp->linenum);
1668             ibp->drop = 1;
1669             continue;
1670         }
1671         entry = XMLLoadEntry(pp, true);
1672         if(entry == NULL)
1673         {
1674             ErrPostEx(SEV_FATAL, ERR_INPUT_CannotReadEntry,
1675                       "Failed ro read entry from file, which starts at line %d. Entry dropped.",
1676                       ibp->linenum);
1677             ibp->drop = 1;
1678             continue;
1679         }
1680 
1681         XMLInitialEntry(ibp, entry, pp->accver, pp->source);
1682         if(ibp->drop != 0)
1683         {
1684             MemFree(entry);
1685             continue;
1686         }
1687         if(XMLTagCheck(ibp->xip, xmkwl) == false)
1688         {
1689             ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1690                       "Incorrectly formatted XML record. Entry dropped.");
1691             ibp->drop = 1;
1692             MemFree(entry);
1693             continue;
1694         }
1695         if(XMLAccessionsCheck(pp, ibp, entry) == false)
1696         {
1697             MemFree(entry);
1698             continue;
1699         }
1700         XMLGetSegment(entry, ibp);
1701         if(XMLCheckRequiredTags(pp, ibp) == false)
1702         {
1703             ibp->drop = 1;
1704             MemFree(entry);
1705             continue;
1706         }
1707         if(XMLKeywordsCheck(entry, ibp, pp->source) == false)
1708         {
1709             MemFree(entry);
1710             continue;
1711         }
1712         if(XMLIndexFeatures(entry, ibp->xip) == false ||
1713            XMLIndexReferences(entry, ibp->xip, ibp->bases) == false)
1714         {
1715             ibp->drop = 1;
1716             MemFree(entry);
1717             continue;
1718         }
1719         MemFree(entry);
1720     }
1721 
1722     for(pp->num_drop = 0, ibpp = pp->entrylist; *ibpp != NULL; ibpp++)
1723         if((*ibpp)->drop != 0)
1724             pp->num_drop++;
1725 
1726     if(pp->indx > 0)
1727         return true;
1728     return false;
1729 }
1730 
1731 /**********************************************************/
XMLBuildRefDataBlk(char * entry,XmlIndexPtr xip,Int2 type)1732 DataBlkPtr XMLBuildRefDataBlk(char* entry, XmlIndexPtr xip, Int2 type)
1733 {
1734     XmlIndexPtr txip;
1735     DataBlkPtr  dbp;
1736     DataBlkPtr  tdbp;
1737 
1738     if(entry == NULL || xip == NULL)
1739         return(NULL);
1740 
1741     while(xip != NULL && xip->tag != INSDSEQ_REFERENCES)
1742         xip = xip->next;
1743     if(xip == NULL || xip->subtags == NULL)
1744         return(NULL);
1745 
1746     for(dbp = NULL, txip = xip->subtags; txip != NULL; txip = txip->next)
1747     {
1748         if(txip->type != type || txip->subtags == NULL)
1749             continue;
1750         if(dbp == NULL)
1751         {
1752             dbp = (DataBlkPtr) MemNew(sizeof(DataBlk));
1753             tdbp = dbp;
1754         }
1755         else
1756         {
1757             tdbp->next = (DataBlkPtr) MemNew(sizeof(DataBlk));
1758             tdbp = tdbp->next;
1759         }
1760         tdbp->type = txip->type;
1761         tdbp->offset = entry;
1762         tdbp->data = (void*) txip->subtags;
1763         tdbp->next = NULL;
1764     }
1765     return(dbp);
1766 }
1767 
1768 /**********************************************************/
XMLGetKeywords(char * entry,XmlIndexPtr xip,TKeywordList & keywords)1769 void XMLGetKeywords(char* entry, XmlIndexPtr xip, TKeywordList& keywords)
1770 {
1771     XmlIndexPtr xipkwd;
1772     char*     p;
1773 
1774     keywords.clear();
1775     if(entry == NULL || xip == NULL)
1776         return;
1777 
1778     for(; xip != NULL; xip = xip->next)
1779         if(xip->tag == INSDSEQ_KEYWORDS && xip->subtags != NULL)
1780             break;
1781     if(xip == NULL)
1782         return;
1783 
1784     for(xipkwd = xip->subtags; xipkwd != NULL; xipkwd = xipkwd->next)
1785     {
1786         p = XMLGetTagValue(entry, xipkwd);
1787         if(p == NULL)
1788             continue;
1789 
1790         keywords.push_back(p);
1791         MemFree(p);
1792     }
1793 }
1794 
1795 /**********************************************************/
XMLConcatSubTags(char * entry,XmlIndexPtr xip,Int4 tag,Char sep)1796 char* XMLConcatSubTags(char* entry, XmlIndexPtr xip, Int4 tag, Char sep)
1797 {
1798     XmlIndexPtr txip;
1799     char*     buf;
1800     char*     p;
1801     char*     q;
1802     size_t      i;
1803 
1804     if(entry == NULL || xip == NULL)
1805         return(NULL);
1806 
1807     while(xip != NULL && xip->tag != tag)
1808         xip = xip->next;
1809 
1810     if(xip == NULL || xip->subtags == NULL)
1811         return(NULL);
1812 
1813     for(i = 0, txip = xip->subtags; txip != NULL; txip = txip->next)
1814         i += (txip->end - txip->start + 2);
1815 
1816     buf = (char*) MemNew(i);
1817     buf[0] = '\0';
1818     for(q = buf, txip = xip->subtags; txip != NULL; txip = txip->next)
1819     {
1820         if(txip->end <= txip->start)
1821             continue;
1822         if(buf[0] != '\0')
1823         {
1824             *q++ = sep;
1825             *q++ = ' ';
1826         }
1827         for(i = txip->start, p = entry + txip->start; i < txip->end; i++)
1828             *q++ = *p++;
1829         *q = '\0';
1830     }
1831     XMLRestoreSpecialCharacters(buf);
1832     return(buf);
1833 }
1834 
1835 END_NCBI_SCOPE
1836