1 /* xm_index.c
2 *
3 * ===========================================================================
4 *
5 * PUBLIC DOMAIN NOTICE
6 * National Center for Biotechnology Information
7 *
8 * This software/database is a "United States Government Work" under the
9 * terms of the United States Copyright Act. It was written as part of
10 * the author's official duties as a United States Government employee and
11 * thus cannot be copyrighted. This software/database is freely available
12 * to the public for use. The National Library of Medicine and the U.S.
13 * Government have not placed any restriction on its use or reproduction.
14 *
15 * Although all reasonable efforts have been taken to ensure the accuracy
16 * and reliability of the software and data, the NLM and the U.S.
17 * Government do not and cannot warrant the performance or results that
18 * may be obtained by using this software or data. The NLM and the U.S.
19 * Government disclaim all warranties, express or implied, including
20 * warranties of performance, merchantability or fitness for any particular
21 * purpose.
22 *
23 * Please cite the author in any work or product based on this material.
24 *
25 * ===========================================================================
26 *
27 * File Name: xm_index.c
28 *
29 * Author: Sergey Bazhin
30 *
31 * File Description:
32 * -----------------
33 * Parsing flat records to memory blocks in XML format.
34 *
35 */
36 #include <ncbi_pch.hpp>
37
38 #include "ftacpp.hpp"
39 #include "index.h"
40
41 #include "ftaerr.hpp"
42 #include "indx_blk.h"
43 #include "indx_def.h"
44 #include "utilfun.h"
45 #include "fta_xml.h"
46
47 #ifdef THIS_FILE
48 # undef THIS_FILE
49 #endif
50 #define THIS_FILE "xm_index.cpp"
51
52 #define XML_FAKE_ACC_TAG "AC "
53
54 BEGIN_NCBI_SCOPE
55
56 typedef struct _XmlKwordBlk {
57 const char *str;
58 Int4 order;
59 Int4 tag;
60 } XmlKwordBlk, *XmlKwordBlkPtr;
61
62 XmlKwordBlk xmkwl[] = {
63 {"<INSDSeq_locus>", 1, INSDSEQ_LOCUS},
64 {"<INSDSeq_length>", 2, INSDSEQ_LENGTH},
65 {"<INSDSeq_strandedness>", 3, INSDSEQ_STRANDEDNESS},
66 {"<INSDSeq_moltype>", 4, INSDSEQ_MOLTYPE},
67 {"<INSDSeq_topology>", 5, INSDSEQ_TOPOLOGY},
68 {"<INSDSeq_division>", 6, INSDSEQ_DIVISION},
69 {"<INSDSeq_update-date>", 7, INSDSEQ_UPDATE_DATE},
70 {"<INSDSeq_create-date>", 8, INSDSEQ_CREATE_DATE},
71 {"<INSDSeq_update-release>", 9, INSDSEQ_UPDATE_RELEASE},
72 {"<INSDSeq_create-release>", 10, INSDSEQ_CREATE_RELEASE},
73 {"<INSDSeq_definition>", 11, INSDSEQ_DEFINITION},
74 {"<INSDSeq_primary-accession>", 12, INSDSEQ_PRIMARY_ACCESSION},
75 {"<INSDSeq_entry-version>", 13, INSDSEQ_ENTRY_VERSION},
76 {"<INSDSeq_accession-version>", 14, INSDSEQ_ACCESSION_VERSION},
77 {"<INSDSeq_other-seqids>", 15, INSDSEQ_OTHER_SEQIDS},
78 {"<INSDSeq_secondary-accessions>", 16, INSDSEQ_SECONDARY_ACCESSIONS},
79 {"<INSDSeq_keywords>", 17, INSDSEQ_KEYWORDS},
80 {"<INSDSeq_segment>", 18, INSDSEQ_SEGMENT},
81 {"<INSDSeq_source>", 19, INSDSEQ_SOURCE},
82 {"<INSDSeq_organism>", 20, INSDSEQ_ORGANISM},
83 {"<INSDSeq_taxonomy>", 21, INSDSEQ_TAXONOMY},
84 {"<INSDSeq_references>", 22, INSDSEQ_REFERENCES},
85 {"<INSDSeq_comment>", 23, INSDSEQ_COMMENT},
86 {"<INSDSeq_primary>", 24, INSDSEQ_PRIMARY},
87 {"<INSDSeq_source-db>", 25, INSDSEQ_SOURCE_DB},
88 {"<INSDSeq_database-reference>", 26, INSDSEQ_DATABASE_REFERENCE},
89 {"<INSDSeq_feature-table>", 27, INSDSEQ_FEATURE_TABLE},
90 {"<INSDSeq_sequence>", 28, INSDSEQ_SEQUENCE},
91 {"<INSDSeq_contig>", 29, INSDSEQ_CONTIG},
92 {NULL, -1, -1}
93 };
94
95 XmlKwordBlk xmfeatkwl[] = {
96 {"<INSDFeature_key>", 1, INSDFEATURE_KEY},
97 {"<INSDFeature_location>", 2, INSDFEATURE_LOCATION},
98 {"<INSDFeature_intervals>", 3, INSDFEATURE_INTERVALS},
99 {"<INSDFeature_quals>", 4, INSDFEATURE_QUALS},
100 {NULL, -1, -1}
101 };
102
103 XmlKwordBlk xmintkwl[] = {
104 {"<INSDInterval_from>", 1, INSDINTERVAL_FROM},
105 {"<INSDInterval_to>", 2, INSDINTERVAL_TO},
106 {"<INSDInterval_point>", 3, INSDINTERVAL_POINT},
107 {"<INSDInterval_accession>", 4, INSDINTERVAL_ACCESSION},
108 {NULL, -1, -1}
109 };
110
111 XmlKwordBlk xmrefkwl[] = {
112 {"<INSDReference_reference>", 1, INSDREFERENCE_REFERENCE},
113 {"<INSDReference_position>", 2, INSDREFERENCE_POSITION},
114 {"<INSDReference_authors>", 3, INSDREFERENCE_AUTHORS},
115 {"<INSDReference_consortium>", 4, INSDREFERENCE_CONSORTIUM},
116 {"<INSDReference_title>", 5, INSDREFERENCE_TITLE},
117 {"<INSDReference_journal>", 6, INSDREFERENCE_JOURNAL},
118 {"<INSDReference_xref>", 7, INSDREFERENCE_XREF},
119 {"<INSDReference_medline>", 8, INSDREFERENCE_MEDLINE},
120 {"<INSDReference_pubmed>", 9, INSDREFERENCE_PUBMED},
121 {"<INSDReference_remark>", 10, INSDREFERENCE_REMARK},
122 {NULL, -1, -1}
123 };
124
125 XmlKwordBlk xmqualkwl[] = {
126 {"<INSDQualifier_name>", 1, INSDQUALIFIER_NAME},
127 {"<INSDQualifier_value>", 2, INSDQUALIFIER_VALUE},
128 {NULL, -1, -1}
129 };
130
131 XmlKwordBlk xmxrefkwl[] = {
132 {"<INSDXref_dbname>", 1, INSDXREF_DBNAME},
133 {"<INSDXref_id>", 2, INSDXREF_ID},
134 {NULL, -1, -1}
135 };
136
137 XmlKwordBlk xmsubkwl[] = {
138 {"<INSDSecondary-accn>", 1, INSDSECONDARY_ACCN},
139 {"<INSDKeyword>", 1, INSDKEYWORD},
140 {"<INSDFeature>", 1, INSDFEATURE},
141 {"<INSDInterval>", 1, INSDINTERVAL},
142 {"<INSDQualifier>", 1, INSDQUALIFIER},
143 {"<INSDReference>", 1, INSDREFERENCE},
144 {"<INSDAuthor>", 1, INSDAUTHOR},
145 {"<INSDXref>", 1, INSDXREF},
146 {NULL, -1, -1}
147 };
148
149 /**********************************************************/
XMLIndexNew(void)150 static XmlIndexPtr XMLIndexNew(void)
151 {
152 XmlIndexPtr xip;
153
154 xip = (XmlIndexPtr) MemNew(sizeof(XmlIndex));
155 xip->tag = -1;
156 xip->order = -1;
157 xip->start = 0;
158 xip->end = 0;
159 xip->start_line = -1;
160 xip->end_line = -1;
161 xip->subtags = NULL;
162 xip->next = NULL;
163 return(xip);
164 }
165
166 /**********************************************************/
/* Replaces the five predefined XML entities in buf with the characters
 * they stand for, in place:
 *
 *     &lt; -> <    &gt; -> >    &amp; -> &    &apos; -> '    &quot; -> "
 *
 * The text is scanned once, left to right, so the result of a replacement
 * is never decoded again ("&amp;lt;" becomes "&lt;"). Anything that is not
 * one of the five entities is copied unchanged. The result is never longer
 * than the input. A NULL buf is ignored.
 */
static void XMLRestoreSpecialCharacters(char* buf)
{
    static const struct {
        const char* text;       /* entity as it appears in the XML */
        size_t      len;        /* number of characters in text    */
        char        ch;         /* replacement character           */
    } entities[] = {
        {"&lt;",   4, '<'},
        {"&gt;",   4, '>'},
        {"&amp;",  5, '&'},
        {"&apos;", 6, '\''},
        {"&quot;", 6, '"'}
    };
    const size_t nent = sizeof(entities) / sizeof(entities[0]);

    char*  src;
    char*  dst;
    size_t e;
    size_t k;

    if(buf == NULL)
        return;

    for(src = buf, dst = buf; *src != '\0';)
    {
        for(e = 0; e < nent; e++)
        {
            /* Prefix compare; stops at the first mismatch, which includes
             * the terminating '\0' of src, so it never reads past the end.
             */
            for(k = 0; k < entities[e].len; k++)
                if(src[k] != entities[e].text[k])
                    break;
            if(k == entities[e].len)
                break;
        }

        if(e < nent)
        {
            *dst++ = entities[e].ch;
            src += entities[e].len;
        }
        else
            *dst++ = *src++;
    }
    *dst = '\0';
}
204
205 /**********************************************************/
XMLGetTagValue(char * entry,XmlIndexPtr xip)206 char* XMLGetTagValue(char* entry, XmlIndexPtr xip)
207 {
208 if(entry == NULL || xip == NULL || xip->start == 0 || xip->end == 0 ||
209 xip->start >= xip->end)
210 return(NULL);
211
212 size_t i = xip->end - xip->start;
213 char* buf = (char*)MemNew(i + 1);
214 StringNCpy(buf, entry + xip->start, i);
215 buf[i] = '\0';
216
217 XMLRestoreSpecialCharacters(buf);
218 return(buf);
219 }
220
221 /**********************************************************/
XMLFindTagValue(char * entry,XmlIndexPtr xip,Int4 tag)222 char* XMLFindTagValue(char* entry, XmlIndexPtr xip, Int4 tag)
223 {
224 for(; xip != NULL; xip = xip->next)
225 if(xip->tag == tag)
226 break;
227 if(xip == NULL)
228 return(NULL);
229 return(XMLGetTagValue(entry, xip));
230 }
231
232 /**********************************************************/
XMLDelSegnum(IndexblkPtr ibp,char * segnum,size_t len2)233 static bool XMLDelSegnum(IndexblkPtr ibp, char* segnum, size_t len2)
234 {
235 if(segnum == NULL)
236 return false;
237 size_t len1 = StringLen(segnum);
238 if(len2 < len1)
239 return false;
240
241 /* check, is there enough digits to delete
242 */
243 size_t tlen = len1;
244 char* str = ibp->blocusname;
245 size_t i = StringLen(str) - 1;
246 for(; tlen > 0 && str[i] >= '0' && str[i] <= '9'; i--)
247 tlen--;
248
249 if(tlen != 0 || i < 0)
250 return false;
251
252 if(len2 > len1 && str[i] == '0')
253 {
254 /* check, is there enough "0" appended
255 */
256 for(tlen = len2 - len1; tlen > 0 && str[i] == '0'; i--)
257 tlen--;
258
259 if(tlen != 0)
260 return false;
261 }
262
263 char* p;
264 char* q;
265 for (q = &str[i + 1], p = q; *p == '0';)
266 p++;
267
268 i = atoi(segnum);
269 if(atoi(p) != (int) i)
270 {
271 ErrPostEx(SEV_REJECT, ERR_SEGMENT_BadLocusName,
272 "Segment suffix in locus name \"%s\" does not match number in <INSDSEQ_segment> line = \"%d\". Entry dropped.",
273 str, i);
274 ibp->drop = 1;
275 }
276
277 *q = '\0'; /* strip off "len" characters */
278 return true;
279 }
280
281 /**********************************************************/
XMLGetSegment(char * entry,IndexblkPtr ibp)282 static void XMLGetSegment(char* entry, IndexblkPtr ibp)
283 {
284 TokenStatBlkPtr stoken;
285 XmlIndexPtr xip;
286 char* buf;
287 char* segnum;
288 char* segtotal;
289
290 if(entry == NULL || ibp == NULL || ibp->xip == NULL)
291 return;
292
293 for(xip = ibp->xip; xip != NULL; xip = xip->next)
294 if(xip->tag == INSDSEQ_SEGMENT)
295 break;
296 if(xip == NULL)
297 return;
298
299 buf = XMLGetTagValue(entry, xip);
300 if(buf == NULL)
301 return;
302
303 stoken = TokenString(buf, ' ');
304
305 if(stoken->num > 2)
306 {
307 segnum = stoken->list->str;
308 segtotal = stoken->list->next->next->str;
309 ibp->segnum = (Uint2) atoi(segnum);
310
311 if (!XMLDelSegnum(ibp, segnum, StringLen(segtotal)))
312 {
313 ErrPostEx(SEV_ERROR, ERR_SEGMENT_BadLocusName,
314 "Bad locus name \"%s\".", ibp->blocusname);
315 }
316
317 ibp->segtotal = (Uint2) atoi(segtotal);
318 }
319 else
320 {
321 ErrPostEx(SEV_ERROR, ERR_SEGMENT_IncompSeg,
322 "Incomplete Segment information at line %d.",
323 xip->start_line);
324 }
325
326 FreeTokenstatblk(stoken);
327 MemFree(buf);
328 }
329
330
s_HasInput(const Parser & config)331 static bool s_HasInput(const Parser& config) {
332 return (config.ffbuf.start != nullptr);
333 }
334
335
s_GetCharAndAdvance(Parser & config)336 static int s_GetCharAndAdvance(Parser& config) {
337 if (*config.ffbuf.current == '\0') {
338 return -1;
339 }
340 return *(config.ffbuf.current++);
341 }
342
s_SetPointer(Parser & config,int offset)343 void s_SetPointer(Parser& config, int offset) {
344 config.ffbuf.current = config.ffbuf.start + offset;
345 }
346
347 /**********************************************************/
XMLPerformIndex(ParserPtr pp)348 static void XMLPerformIndex(ParserPtr pp)
349 {
350 XmlKwordBlkPtr xkbp;
351 IndBlkNextPtr ibnp;
352 IndBlkNextPtr tibnp;
353 XmlIndexPtr xip;
354 IndexblkPtr ibp;
355 char* p;
356 Char s[60];
357 Char ch;
358 size_t count;
359 Int4 line;
360 Int4 c;
361 Int4 i;
362
363
364 if (!pp || !s_HasInput(*pp)) {
365 return;
366 }
367
368 c = 0;
369 s[0] = '\0';
370 bool within = false;
371 tibnp = NULL;
372 ibnp = NULL;
373 ibp = NULL;
374 xip = NULL;
375 pp->indx = 0;
376 size_t start_len = StringLen(INSDSEQ_START);
377 for(count = 0, line = 1;;)
378 {
379 if(c != '<')
380 {
381 c = s_GetCharAndAdvance(*pp);
382 if(c < 0)
383 break;
384 count++;
385 if((Char) c == '\n')
386 line++;
387 }
388 if(c != '<')
389 continue;
390
391 s[0] = '<';
392 for(i = 1; i < 50; i++)
393 {
394 c = s_GetCharAndAdvance(*pp);
395 if(c < 0)
396 break;
397 count++;
398 ch = (Char) c;
399 if(ch == '\n')
400 line++;
401 s[i] = ch;
402 if(ch == '<' || ch == '>')
403 break;
404 }
405 if(c < 0)
406 break;
407 if(ch == '<')
408 continue;
409 s[++i] = '\0';
410 if(StringCmp(s, INSDSEQ_START) == 0)
411 {
412 if (within)
413 continue;
414
415 within = true;
416 if(ibnp == NULL)
417 {
418 ibnp = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
419 tibnp = ibnp;
420 }
421 else
422 {
423 tibnp->next = (IndBlkNextPtr) MemNew(sizeof(IndBlkNext));
424 tibnp = tibnp->next;
425 }
426 tibnp->next = NULL;
427 tibnp->ibp = new Indexblk;
428 ibp = tibnp->ibp;
429 ibp->xip = NULL;
430 ibp->offset = count - start_len;
431 ibp->linenum = line;
432 ibp->len = 0;
433 pp->indx++;
434 continue;
435 }
436 if (!within)
437 {
438 if(StringCmp(s, INSDSEQ_END) == 0)
439 ErrPostEx(SEV_ERROR, ERR_FORMAT_UnexpectedEnd,
440 "Unexpected end tag \"%s\" of XML record found at line %d.",
441 s, line);
442 continue;
443 }
444 if(StringCmp(s, INSDSEQ_END) == 0)
445 {
446 ibp->len = count - ibp->offset;
447 within = false;
448 continue;
449 }
450 p = s + ((s[1] == '/') ? 2 : 1);
451 for(xkbp = xmkwl; xkbp->str != NULL; xkbp++)
452 if(StringCmp(p, xkbp->str + 1) == 0)
453 break;
454 if(xkbp->str == NULL)
455 continue;
456 if(ibp->xip == NULL || xip->tag != xkbp->tag)
457 {
458 if(ibp->xip == NULL)
459 {
460 ibp->xip = XMLIndexNew();
461 xip = ibp->xip;
462 }
463 else
464 {
465 xip->next = XMLIndexNew();
466 xip = xip->next;
467 }
468 xip->tag = xkbp->tag;
469 xip->order = xkbp->order;
470 if(s[1] == '/')
471 {
472 xip->end = count - i - ibp->offset;
473 xip->end_line = line;
474 }
475 else
476 {
477 xip->start = count - ibp->offset;
478 xip->start_line = line;
479 }
480 continue;
481 }
482 if(s[1] == '/')
483 {
484 if(xip->end != 0)
485 {
486 xip->next = XMLIndexNew();
487 xip = xip->next;
488 xip->tag = xkbp->tag;
489 xip->order = xkbp->order;
490 }
491 xip->end = count - i - ibp->offset;
492 xip->end_line = line;
493 }
494 else
495 {
496 if(xip->start != 0)
497 {
498 xip->next = XMLIndexNew();
499 xip = xip->next;
500 xip->tag = xkbp->tag;
501 xip->order = xkbp->order;
502 }
503 xip->start = count - ibp->offset;
504 xip->start_line = line;
505 }
506 }
507
508 pp->entrylist = (IndexblkPtr*) MemNew((pp->indx + 1) *
509 sizeof(IndexblkPtr));
510 for(tibnp = ibnp, i = 0; tibnp != NULL; i++, tibnp = ibnp)
511 {
512 pp->entrylist[i] = tibnp->ibp;
513 ibnp = tibnp->next;
514 MemFree(tibnp);
515 }
516 pp->entrylist[i] = NULL;
517 }
518
519 /**********************************************************/
XMLParseVersion(IndexblkPtr ibp,char * line)520 static void XMLParseVersion(IndexblkPtr ibp, char* line)
521 {
522 char* p;
523 char* q;
524
525 if(line == NULL)
526 {
527 ErrPostEx(SEV_FATAL, ERR_VERSION_BadVersionLine,
528 "Empty <INSDSeq_accession-version> line. Entry dropped.");
529 ibp->drop = 1;
530 return;
531 }
532
533 for(p = line; *p != '\0' && *p != ' ' && *p != '\t';)
534 p++;
535 if(*p != '\0')
536 {
537 ErrPostEx(SEV_FATAL, ERR_VERSION_BadVersionLine,
538 "Incorrect <INSDSeq_accession-version> line: \"%s\". Entry dropped.",
539 line);
540 ibp->drop = 1;
541 return;
542 }
543 q = StringRChr(line, '.');
544 if(q == NULL)
545 {
546 ErrPostEx(SEV_FATAL, ERR_VERSION_MissingVerNum,
547 "Missing version number in <INSDSeq_accession-version> line: \"%s\". Entry dropped.",
548 line);
549 ibp->drop = 1;
550 return;
551 }
552 for(p = q + 1; *p >= '0' && *p <= '9';)
553 p++;
554 if(*p != '\0')
555 {
556 ErrPostEx(SEV_FATAL, ERR_VERSION_NonDigitVerNum,
557 "Incorrect VERSION number in <INSDSeq_accession-version> line: \"%s\". Entry dropped.",
558 line);
559 ibp->drop = 1;
560 return;
561 }
562 *q = '\0';
563 if(ibp->acnum == NULL || StringCmp(ibp->acnum, line) != 0)
564 {
565 *q = '.';
566 ErrPostEx(SEV_FATAL, ERR_VERSION_AccessionsDontMatch,
567 "Accessions in <INSDSeq_accession-version> and <INSDSeq_primary-accession> lines don't match: \"%s\" vs \"%s\". Entry dropped.",
568 line, (ibp->acnum == NULL) ? "NULL" : ibp->acnum);
569 ibp->drop = 1;
570 return;
571 }
572 *q++ = '.';
573 ibp->vernum = atoi(q);
574
575 if(ibp->vernum > 0)
576 return;
577
578 ErrPostEx(SEV_FATAL, ERR_VERSION_InvalidVersion,
579 "Version number \"%d\" from Accession.Version value \"%s.%d\" is not a positive integer. Entry dropped.",
580 ibp->vernum, ibp->acnum, ibp->vernum);
581 ibp->drop = 1;
582 }
583
584 /**********************************************************/
XMLInitialEntry(IndexblkPtr ibp,char * entry,bool accver,Parser::ESource source)585 static void XMLInitialEntry(IndexblkPtr ibp, char* entry, bool accver,
586 Parser::ESource source)
587 {
588 XmlIndexPtr xip;
589 char* buf;
590
591 if(ibp == NULL || ibp->xip == NULL || entry == NULL)
592 return;
593 xip = ibp->xip;
594
595 if(source == Parser::ESource::USPTO)
596 ibp->is_pat = true;
597
598 ibp->locusname[0] = '\0';
599 ibp->acnum[0] = '\0';
600 for(xip = ibp->xip; xip != NULL; xip = xip->next)
601 {
602 if(xip->tag == INSDSEQ_LOCUS && ibp->locusname[0] == '\0')
603 {
604 if(xip->start == 0 || xip->end == 0 || xip->start >= xip->end ||
605 source == Parser::ESource::USPTO)
606 {
607 StringCpy(ibp->locusname, "???");
608 StringCpy(ibp->blocusname, "???");
609 continue;
610 }
611 size_t imax = xip->end - xip->start;
612 if(imax > (int) sizeof(ibp->locusname) - 1)
613 imax = sizeof(ibp->locusname) - 1;
614 StringNCpy(ibp->locusname, entry + xip->start, imax);
615 ibp->locusname[imax] = '\0';
616 StringCpy(ibp->blocusname, ibp->locusname);
617 }
618 else if(xip->tag == INSDSEQ_PRIMARY_ACCESSION && ibp->acnum[0] == '\0')
619 {
620 if(xip->start == 0 || xip->end == 0 || xip->start >= xip->end)
621 {
622 StringCpy(ibp->acnum, "???");
623 continue;
624 }
625 size_t imax = xip->end - xip->start;
626 if(imax > (int) sizeof(ibp->acnum) - 1)
627 imax = sizeof(ibp->acnum) - 1;
628 StringNCpy(ibp->acnum, entry + xip->start, imax);
629 ibp->acnum[imax] = '\0';
630 }
631 if(ibp->locusname[0] != '\0' && ibp->acnum[0] != '\0')
632 break;
633 }
634
635 FtaInstallPrefix(PREFIX_LOCUS, ibp->locusname, NULL);
636 if(ibp->acnum[0] == '\0')
637 FtaInstallPrefix(PREFIX_ACCESSION, ibp->locusname, NULL);
638 else
639 FtaInstallPrefix(PREFIX_ACCESSION, ibp->acnum, NULL);
640
641 if(accver)
642 {
643 for(xip = ibp->xip; xip != NULL; xip = xip->next)
644 {
645 if(xip->tag != INSDSEQ_ACCESSION_VERSION)
646 continue;
647 buf = XMLGetTagValue(entry, xip);
648 XMLParseVersion(ibp, buf);
649 if(buf != NULL)
650 {
651 FtaInstallPrefix(PREFIX_ACCESSION, buf, NULL);
652 MemFree(buf);
653 }
654 break;
655 }
656 }
657
658 ibp->bases = 0;
659 ibp->date = NULL;
660 StringCpy(ibp->division, "???");
661 for(xip = ibp->xip; xip != NULL; xip = xip->next)
662 {
663 if(xip->tag == INSDSEQ_LENGTH && ibp->bases == 0)
664 {
665 buf = XMLGetTagValue(entry, xip);
666 if(buf == NULL)
667 continue;
668 ibp->bases = (size_t) atoi(buf);
669 MemFree(buf);
670 }
671 else if(xip->tag == INSDSEQ_UPDATE_DATE && ibp->date == NULL)
672 {
673 buf = XMLGetTagValue(entry, xip);
674 if(buf == NULL)
675 continue;
676 ibp->date = GetUpdateDate(buf, source);
677 MemFree(buf);
678 }
679 else if(xip->tag == INSDSEQ_DIVISION && ibp->division[0] == '?')
680 {
681 if(xip->start == 0 || xip->end == 0 || xip->start >= xip->end ||
682 xip->end - xip->start < 3)
683 continue;
684 StringNCpy(ibp->division, entry + xip->start, 3);
685 ibp->division[3] = '\0';
686 if(StringICmp(ibp->division, "EST") == 0)
687 ibp->EST = true;
688 else if(StringCmp(ibp->division, "STS") == 0)
689 ibp->STS = true;
690 else if(StringCmp(ibp->division, "GSS") == 0)
691 ibp->GSS = true;
692 else if(StringCmp(ibp->division, "HTC") == 0)
693 ibp->HTC = true;
694 }
695 if(ibp->bases > 0 && ibp->date != NULL && ibp->division[0] != '?')
696 break;
697 }
698 }
699
700 /**********************************************************/
XMLStringByTag(XmlKwordBlkPtr xkbp,Int4 tag)701 static const char *XMLStringByTag(XmlKwordBlkPtr xkbp, Int4 tag)
702 {
703 for(; xkbp->str != NULL; xkbp++)
704 if(xkbp->tag == tag)
705 break;
706 if(xkbp->str == NULL)
707 return("???");
708 return(xkbp->str);
709 }
710
711 /**********************************************************/
XMLTagCheck(XmlIndexPtr xip,XmlKwordBlkPtr xkbp)712 static bool XMLTagCheck(XmlIndexPtr xip, XmlKwordBlkPtr xkbp)
713 {
714 XmlIndexPtr txip;
715 bool ret = true;
716 for(txip = xip; txip != NULL; txip = txip->next)
717 {
718 if(txip->start == 0)
719 {
720 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingStartTag,
721 "XML record's missing start tag for \"%s\" at line %d.",
722 XMLStringByTag(xkbp, txip->tag), txip->end_line);
723 ret = false;
724 }
725 if(txip->end == 0)
726 {
727 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingEndTag,
728 "XML record's missing end tag for \"%s\" at line %d.",
729 XMLStringByTag(xkbp, txip->tag), txip->start_line);
730 ret = false;
731 }
732 if(txip->next != NULL && txip->order >= txip->next->order)
733 {
734 ErrPostEx(SEV_ERROR, ERR_FORMAT_LineTypeOrder,
735 "XML tag \"%s\" at line %d is out of order.",
736 XMLStringByTag(xkbp, txip->next->tag),
737 (txip->next->start > 0) ? txip->next->start_line :
738 txip->next->end_line);
739 ret = false;
740 }
741 }
742 return(ret);
743 }
744
745 /**********************************************************/
XMLSameTagsCheck(XmlIndexPtr xip,char * name)746 static bool XMLSameTagsCheck(XmlIndexPtr xip, char* name)
747 {
748 bool ret = true;
749
750 for (XmlIndexPtr txip = xip; txip != NULL; txip = txip->next)
751 {
752 if(txip->start == 0)
753 {
754 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingStartTag,
755 "XML record's missing start tag for \"%s\" at line %d.",
756 name, txip->end_line);
757 ret = false;
758 }
759 if(txip->end == 0)
760 {
761 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLMissingEndTag,
762 "XML record's missing end tag for \"%s\" at line %d.",
763 name, txip->start_line);
764 ret = false;
765 }
766 }
767 return(ret);
768 }
769
770 /**********************************************************/
XMLIndexSameSubTags(char * entry,XmlIndexPtr xip,Int4 tag)771 static XmlIndexPtr XMLIndexSameSubTags(char* entry, XmlIndexPtr xip,
772 Int4 tag)
773 {
774 XmlIndexPtr xipsub;
775 XmlIndexPtr txipsub;
776 char* name;
777 char* c;
778 char* p;
779 size_t count;
780 Char s[60];
781 Int4 line;
782 Int4 i;
783
784 if(entry == NULL || xip == NULL)
785 return(NULL);
786
787 name = (char*) XMLStringByTag(xmsubkwl, tag);
788 if(name == NULL)
789 return(NULL);
790
791 s[0] = '\0';
792 xipsub = NULL;
793 txipsub = NULL;
794 line = xip->start_line;
795 c = entry + xip->start;
796 for(count = xip->start + 1;;)
797 {
798 if(*c != '<')
799 {
800 c++;
801 count++;
802 if(*c == '\0' || count > xip->end)
803 break;
804 if(*c == '\n')
805 line++;
806 }
807 if(*c != '<')
808 continue;
809
810 for(s[0] = '<', i = 1; i < 50; i++)
811 {
812 c++;
813 count++;
814 if(*c == '\0' || count > xip->end)
815 break;
816 if(*c == '\n')
817 line++;
818 s[i] = *c;
819 if(*c == '<' || *c == '>')
820 break;
821 }
822 if(*c == '\0' || count > xip->end)
823 break;
824 if(*c == '<')
825 continue;
826 s[++i] = '\0';
827 p = s + ((s[1] == '/') ? 2 : 1);
828 if(StringCmp(p, name + 1) != 0)
829 continue;
830
831 if(xipsub == NULL)
832 {
833 xipsub = XMLIndexNew();
834 txipsub = xipsub;
835 }
836 else if((s[1] != '/' && txipsub->start != 0) ||
837 (s[1] == '/' && txipsub->end != 0))
838 {
839 txipsub->next = XMLIndexNew();
840 txipsub = txipsub->next;
841 }
842 if(s[1] == '/')
843 {
844 txipsub->end = count - i;
845 txipsub->end_line = line;
846 }
847 else
848 {
849 txipsub->start = count;
850 txipsub->start_line = line;
851 }
852 txipsub->tag = tag;
853 }
854
855 if(XMLSameTagsCheck(xipsub, name))
856 return(xipsub);
857
858 XMLIndexFree(xipsub);
859 return(NULL);
860 }
861
862 /**********************************************************/
XMLAccessionsCheck(ParserPtr pp,IndexblkPtr ibp,char * entry)863 static bool XMLAccessionsCheck(ParserPtr pp, IndexblkPtr ibp, char* entry)
864 {
865 XmlIndexPtr xip;
866 XmlIndexPtr xipsec;
867 char* buf;
868 char* p;
869
870 bool ret = true;
871 size_t len = StringLen(ibp->acnum) + StringLen(XML_FAKE_ACC_TAG) + 1;
872
873 for(xip = ibp->xip; xip != NULL; xip = xip->next)
874 if(xip->tag == INSDSEQ_SECONDARY_ACCESSIONS)
875 break;
876
877 if(xip == NULL)
878 {
879 buf = (char*) MemNew(len);
880 StringCpy(buf, XML_FAKE_ACC_TAG);
881 StringCat(buf, ibp->acnum);
882 ret = GetAccession(pp, buf, ibp, 2);
883 MemFree(buf);
884 return(ret);
885 }
886
887 xip->subtags = XMLIndexSameSubTags(entry, xip, INSDSECONDARY_ACCN);
888 if(xip->subtags == NULL)
889 {
890 p = (char*) XMLStringByTag(xmkwl, INSDSEQ_SECONDARY_ACCESSIONS);
891 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
892 "Incorrectly formatted \"%s\" XML block. Entry dropped.", p);
893 ibp->drop = 1;
894 return false;
895 }
896
897 for(xipsec = xip->subtags; xipsec != NULL; xipsec = xipsec->next)
898 len += (xipsec->end - xipsec->start + 1);
899
900 buf = (char*) MemNew(len);
901 StringCpy(buf, XML_FAKE_ACC_TAG);
902 StringCat(buf, ibp->acnum);
903 for(xipsec = xip->subtags; xipsec != NULL; xipsec = xipsec->next)
904 {
905 p = XMLGetTagValue(entry, xipsec);
906 if(p == NULL)
907 continue;
908 StringCat(buf, " ");
909 StringCat(buf, p);
910 MemFree(p);
911 }
912 ret = GetAccession(pp, buf, ibp, 2);
913 MemFree(buf);
914 return(ret);
915 }
916
917 /**********************************************************/
XMLKeywordsCheck(char * entry,IndexblkPtr ibp,Parser::ESource source)918 static bool XMLKeywordsCheck(char* entry, IndexblkPtr ibp, Parser::ESource source)
919 {
920 XmlIndexPtr xip;
921 XmlIndexPtr xipkwd;
922 ValNodePtr vnp;
923 char* buf;
924 char* p;
925
926 bool tpa_check = (source == Parser::ESource::EMBL);
927
928 if(entry == NULL || ibp == NULL || ibp->xip == NULL)
929 return true;
930
931 for(xip = ibp->xip; xip != NULL; xip = xip->next)
932 if(xip->tag == INSDSEQ_KEYWORDS)
933 break;
934 if(xip == NULL)
935 return true;
936
937 xip->subtags = XMLIndexSameSubTags(entry, xip, INSDKEYWORD);
938 if(xip->subtags == NULL)
939 {
940 p = (char*) XMLStringByTag(xmkwl, INSDSEQ_KEYWORDS);
941 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
942 "Incorrectly formatted \"%s\" XML block. Entry dropped.", p);
943 ibp->drop = 1;
944 return false;
945 }
946
947 size_t len = 0;
948 for(xipkwd = xip->subtags; xipkwd != NULL; xipkwd = xipkwd->next)
949 len += (xipkwd->end - xipkwd->start + 2);
950
951 buf = (char*) MemNew(len);
952 *buf = '\0';
953 for(xipkwd = xip->subtags; xipkwd != NULL; xipkwd = xipkwd->next)
954 {
955 p = XMLGetTagValue(entry, xipkwd);
956 if(p == NULL)
957 continue;
958 if(*buf != '\0')
959 StringCat(buf, "; ");
960 StringCat(buf, p);
961 MemFree(p);
962 }
963
964 vnp = ConstructValNode(NULL, 0, buf);
965 check_est_sts_gss_tpa_kwds(vnp, len, ibp, tpa_check, ibp->specialist_db,
966 ibp->inferential, ibp->experimental,
967 ibp->assembly);
968 MemFree(buf);
969 MemFree(vnp);
970 return true;
971 }
972
973 /**********************************************************/
XMLErrField(Int4 tag)974 static bool XMLErrField(Int4 tag)
975 {
976 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
977 "No %s data in XML format file. Entry dropped.",
978 XMLStringByTag(xmkwl, tag));
979 return false;
980 }
981
982 /**********************************************************/
XMLCheckRequiredTags(ParserPtr pp,IndexblkPtr ibp)983 static bool XMLCheckRequiredTags(ParserPtr pp, IndexblkPtr ibp)
984 {
985 XmlIndexPtr xip;
986 bool got_locus = false;
987 bool got_length = false;
988 bool got_moltype = false;
989 bool got_division = false;
990 bool got_update_date = false;
991 bool got_definition = false;
992 bool got_accession = false;
993 bool got_version = false;
994 bool got_source = false;
995 bool got_organism = false;
996 bool got_reference = false;
997 bool got_primary = false;
998 bool got_features = false;
999 bool ret = true;
1000
1001 ibp->origin = false;
1002 ibp->is_contig = false;
1003 for(xip = ibp->xip; xip != NULL; xip = xip->next)
1004 {
1005 if(xip->tag == INSDSEQ_LOCUS && pp->source != Parser::ESource::USPTO)
1006 got_locus = true;
1007 else if(xip->tag == INSDSEQ_LENGTH)
1008 got_length = true;
1009 else if(xip->tag == INSDSEQ_MOLTYPE)
1010 got_moltype = true;
1011 else if(xip->tag == INSDSEQ_DIVISION)
1012 got_division = true;
1013 else if(xip->tag == INSDSEQ_UPDATE_DATE)
1014 got_update_date = true;
1015 else if(xip->tag == INSDSEQ_DEFINITION)
1016 got_definition = true;
1017 else if(xip->tag == INSDSEQ_PRIMARY_ACCESSION)
1018 got_accession = true;
1019 else if(xip->tag == INSDSEQ_ACCESSION_VERSION)
1020 got_version = true;
1021 else if(xip->tag == INSDSEQ_SOURCE)
1022 got_source = true;
1023 else if(xip->tag == INSDSEQ_ORGANISM)
1024 got_organism = true;
1025 else if(xip->tag == INSDSEQ_REFERENCES)
1026 got_reference = true;
1027 else if(xip->tag == INSDSEQ_PRIMARY)
1028 got_primary = true;
1029 else if(xip->tag == INSDSEQ_FEATURE_TABLE)
1030 got_features = true;
1031 else if(xip->tag == INSDSEQ_CONTIG)
1032 ibp->is_contig = true;
1033 else if(xip->tag == INSDSEQ_SEQUENCE)
1034 ibp->origin = true;
1035 }
1036
1037 if(got_locus == false && pp->source != Parser::ESource::USPTO)
1038 ret = XMLErrField(INSDSEQ_LOCUS);
1039 if(got_length == false)
1040 ret = XMLErrField(INSDSEQ_LENGTH);
1041 if(got_moltype == false)
1042 ret = XMLErrField(INSDSEQ_MOLTYPE);
1043 if(got_division == false)
1044 ret = XMLErrField(INSDSEQ_DIVISION);
1045 if(got_update_date == false && pp->source != Parser::ESource::USPTO)
1046 ret = XMLErrField(INSDSEQ_UPDATE_DATE);
1047 if(got_definition == false)
1048 ret = XMLErrField(INSDSEQ_DEFINITION);
1049 if(got_accession == false)
1050 {
1051 ErrPostEx(SEV_ERROR, ERR_ACCESSION_NoAccessNum,
1052 "No accession number for this record. Entry dropped.");
1053 ret = false;
1054 }
1055 if(got_version == false)
1056 {
1057 if(pp->accver != false)
1058 ret = XMLErrField(INSDSEQ_ACCESSION_VERSION);
1059 }
1060 else if(pp->source == Parser::ESource::USPTO)
1061 {
1062 ErrPostEx(SEV_REJECT, ERR_ENTRY_InvalidLineType,
1063 "Line type %s is not allowed for USPTO records. Entry dropped.",
1064 XMLStringByTag(xmkwl, INSDSEQ_PRIMARY));
1065 ret = false;
1066 }
1067 if(got_source == false)
1068 ret = XMLErrField(INSDSEQ_SOURCE);
1069 if(got_organism == false)
1070 ret = XMLErrField(INSDSEQ_ORGANISM);
1071 if(got_reference == false && pp->source != Parser::ESource::Flybase &&
1072 ibp->is_wgs == false &&
1073 (pp->source != Parser::ESource::Refseq ||
1074 StringNCmp(ibp->acnum, "NW_", 3) != 0))
1075 ret = XMLErrField(INSDSEQ_REFERENCES);
1076 if (got_primary && ibp->is_tpa == false && ibp->tsa_allowed == false)
1077 {
1078 ErrPostEx(SEV_ERROR, ERR_ENTRY_InvalidLineType,
1079 "Line type %s is allowed for TPA or TSA records only. Continue anyway.",
1080 XMLStringByTag(xmkwl, INSDSEQ_PRIMARY));
1081 }
1082 if(got_features == false)
1083 ret = XMLErrField(INSDSEQ_FEATURE_TABLE);
1084 if(ibp->is_contig && ibp->segnum != 0)
1085 {
1086 ErrPostEx(SEV_ERROR, ERR_FORMAT_ContigInSegset,
1087 "%s data are not allowed for members of segmented sets. Entry dropped.",
1088 XMLStringByTag(xmkwl, INSDSEQ_CONTIG));
1089 ret = false;
1090 }
1091
1092 ibp->is_tpa_wgs_con = (ibp->is_contig && ibp->is_wgs && ibp->is_tpa);
1093
1094 return(ret);
1095 }
1096
1097 /**********************************************************/
XMLLoadEntry(ParserPtr pp,bool err)1098 char* XMLLoadEntry(ParserPtr pp, bool err)
1099 {
1100 IndexblkPtr ibp;
1101 char* entry;
1102 char* p;
1103 size_t i;
1104 Int4 c;
1105
1106 if (!pp || !s_HasInput(*pp)) {
1107 return nullptr;
1108 }
1109
1110 ibp = pp->entrylist[pp->curindx];
1111 if(ibp == NULL || ibp->len == 0)
1112 return(NULL);
1113
1114 entry = (char*) MemNew(ibp->len + 1);
1115 s_SetPointer(*pp, ibp->offset);
1116
1117
1118 for(p = entry, i = 0; i < ibp->len; i++)
1119 {
1120 c = s_GetCharAndAdvance(*pp);
1121 if(c < 0)
1122 break;
1123 if (c == 13) {
1124 c = 10;
1125 }
1126 if(c > 126 || (c < 32 && c != 10))
1127 {
1128 if (err)
1129 ErrPostEx(SEV_WARNING, ERR_FORMAT_NonAsciiChar,
1130 "None-ASCII character within the record which begins at line %d, decimal value %d, replaced by #.",
1131 ibp->linenum, c);
1132 *p++ = '#';
1133 }
1134 else
1135 *p++ = (Char) c;
1136 }
1137 if(i != ibp->len)
1138 {
1139 MemFree(entry);
1140 return(NULL);
1141 }
1142 *p = '\0';
1143
1144 return(entry);
1145 }
1146
1147
1148 /**********************************************************/
XMLIndexSubTags(char * entry,XmlIndexPtr xip,XmlKwordBlkPtr xkbp)1149 static bool XMLIndexSubTags(char* entry, XmlIndexPtr xip, XmlKwordBlkPtr xkbp)
1150 {
1151 XmlKwordBlkPtr txkbp;
1152 XmlIndexPtr xipsub;
1153 char* c;
1154 char* p;
1155 Char s[60];
1156 size_t count;
1157 Int4 line;
1158 Int4 i;
1159
1160 if(entry == NULL || xip == NULL)
1161 return false;
1162
1163 s[0] = '\0';
1164 xipsub = NULL;
1165 line = xip->start_line;
1166 c = entry + xip->start;
1167 for(count = xip->start + 1;;)
1168 {
1169 if(*c != '<')
1170 {
1171 c++;
1172 count++;
1173 if(*c == '\0' || count > xip->end)
1174 break;
1175 if(*c == '\n')
1176 line++;
1177 }
1178 if(*c != '<')
1179 continue;
1180
1181 for(s[0] = '<', i = 1; i < 50; i++)
1182 {
1183 c++;
1184 count++;
1185 if(*c == '\0' || count > xip->end)
1186 break;
1187 if(*c == '\n')
1188 line++;
1189 s[i] = *c;
1190 if(*c == '<' || *c == '>')
1191 break;
1192 }
1193 if(*c == '\0' || count > xip->end)
1194 break;
1195 if(*c == '<')
1196 continue;
1197 s[++i] = '\0';
1198 p = s + ((s[1] == '/') ? 2 : 1);
1199 for(txkbp = xkbp; txkbp->str != NULL; txkbp++)
1200 if(StringCmp(p, txkbp->str + 1) == 0)
1201 break;
1202 if(txkbp->str == NULL)
1203 continue;
1204 if(xipsub == NULL || xipsub->tag != txkbp->tag)
1205 {
1206 if(xipsub == NULL)
1207 {
1208 xipsub = XMLIndexNew();
1209 xip->subtags = xipsub;
1210 }
1211 else
1212 {
1213 xipsub->next = XMLIndexNew();
1214 xipsub = xipsub->next;
1215 }
1216 xipsub->tag = txkbp->tag;
1217 xipsub->order = txkbp->order;
1218 if(s[1] == '/')
1219 {
1220 xipsub->end = count - i;
1221 xipsub->end_line = line;
1222 }
1223 else
1224 {
1225 xipsub->start = count;
1226 xipsub->start_line = line;
1227 }
1228 continue;
1229 }
1230 if(s[1] == '/')
1231 {
1232 if(xipsub->end != 0)
1233 {
1234 xipsub->next = XMLIndexNew();
1235 xipsub = xipsub->next;
1236 xipsub->tag = txkbp->tag;
1237 xipsub->order = txkbp->order;
1238 }
1239 xipsub->end = count - i;
1240 xipsub->end_line = line;
1241 }
1242 else
1243 {
1244 if(xipsub->start != 0)
1245 {
1246 xipsub->next = XMLIndexNew();
1247 xipsub = xipsub->next;
1248 xipsub->tag = txkbp->tag;
1249 xipsub->order = txkbp->order;
1250 }
1251 xipsub->start = count;
1252 xipsub->start_line = line;
1253 }
1254 }
1255
1256 if (!XMLTagCheck(xip->subtags, xkbp))
1257 return false;
1258
1259 return true;
1260 }
1261
1262 /**********************************************************/
XMLCheckRequiredFeatTags(XmlIndexPtr xip)1263 static bool XMLCheckRequiredFeatTags(XmlIndexPtr xip)
1264 {
1265 bool got_key = false;
1266 bool got_location = false;
1267 bool ret = true;
1268
1269 for(; xip != NULL; xip = xip->next)
1270 {
1271 if(xip->tag == INSDFEATURE_KEY)
1272 got_key = true;
1273 else if(xip->tag == INSDFEATURE_LOCATION)
1274 got_location = true;
1275 }
1276
1277 if(!got_key)
1278 {
1279 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1280 "Feature table is missing %s data in XML format file.",
1281 XMLStringByTag(xmfeatkwl, INSDFEATURE_KEY));
1282 ret = false;
1283 }
1284
1285 if(!got_location)
1286 {
1287 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1288 "Feature table is missing %s data in XML format file.",
1289 XMLStringByTag(xmfeatkwl, INSDFEATURE_LOCATION));
1290 ret = false;
1291 }
1292 return(ret);
1293 }
1294
1295 /**********************************************************/
XMLCheckRequiredIntTags(XmlIndexPtr xip)1296 static bool XMLCheckRequiredIntTags(XmlIndexPtr xip)
1297 {
1298 bool got_from = false;
1299 bool got_to = false;
1300 bool got_point = false;
1301 bool got_accession = false;
1302 bool ret = true;
1303
1304 for(; xip != NULL; xip = xip->next)
1305 {
1306 if(xip->tag == INSDINTERVAL_FROM)
1307 got_from = true;
1308 else if(xip->tag == INSDINTERVAL_TO)
1309 got_to = true;
1310 else if(xip->tag == INSDINTERVAL_POINT)
1311 got_point = true;
1312 else if(xip->tag == INSDINTERVAL_ACCESSION)
1313 got_accession = true;
1314 }
1315
1316 if(!got_accession)
1317 {
1318 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1319 "Feature's interval block is missing %s data in XML format file.",
1320 XMLStringByTag(xmintkwl, INSDINTERVAL_ACCESSION));
1321 ret = false;
1322 }
1323
1324 if(got_point)
1325 {
1326 if(got_from || got_to)
1327 {
1328 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLInvalidINSDInterval,
1329 "%s tag cannot co-exist with %s or %s or both in XML format.",
1330 XMLStringByTag(xmintkwl, INSDINTERVAL_POINT),
1331 XMLStringByTag(xmintkwl, INSDINTERVAL_FROM),
1332 XMLStringByTag(xmintkwl, INSDINTERVAL_TO));
1333 ret = false;
1334 }
1335 }
1336 else if(got_from == false || got_to == false)
1337 {
1338 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLInvalidINSDInterval,
1339 "%s must contain either both of %s and %s, or %s.",
1340 XMLStringByTag(xmsubkwl, INSDINTERVAL),
1341 XMLStringByTag(xmintkwl, INSDINTERVAL_FROM),
1342 XMLStringByTag(xmintkwl, INSDINTERVAL_TO),
1343 XMLStringByTag(xmintkwl, INSDINTERVAL_POINT));
1344 ret = false;
1345 }
1346
1347 return(ret);
1348 }
1349
1350 /**********************************************************/
XMLCheckRequiredQualTags(XmlIndexPtr xip)1351 static bool XMLCheckRequiredQualTags(XmlIndexPtr xip)
1352 {
1353 for (; xip != NULL; xip = xip->next)
1354 {
1355 if (xip->tag == INSDQUALIFIER_NAME)
1356 break;
1357 }
1358
1359 if(xip != NULL)
1360 return true;
1361
1362 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1363 "Qualifier block is missing %s data in XML format file.",
1364 XMLStringByTag(xmqualkwl, INSDQUALIFIER_NAME));
1365 return false;
1366 }
1367
1368 /**********************************************************/
XMLIndexFeatures(char * entry,XmlIndexPtr xip)1369 static bool XMLIndexFeatures(char* entry, XmlIndexPtr xip)
1370 {
1371 XmlIndexPtr xipfeat;
1372 XmlIndexPtr xipsub;
1373 XmlIndexPtr txip;
1374
1375 if(xip == NULL || entry == NULL)
1376 return true;
1377
1378 for (; xip != NULL; xip = xip->next)
1379 {
1380 if (xip->tag == INSDSEQ_FEATURE_TABLE)
1381 break;
1382 }
1383
1384 if(xip == NULL)
1385 return true;
1386
1387 xip->subtags = XMLIndexSameSubTags(entry, xip, INSDFEATURE);
1388 if(xip->subtags == NULL)
1389 {
1390 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1391 "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1392 XMLStringByTag(xmkwl, INSDSEQ_FEATURE_TABLE));
1393 return false;
1394 }
1395
1396 for(xipfeat = xip->subtags; xipfeat != NULL; xipfeat = xipfeat->next)
1397 {
1398 if(XMLIndexSubTags(entry, xipfeat, xmfeatkwl) == false ||
1399 XMLCheckRequiredFeatTags(xipfeat->subtags) == false)
1400 break;
1401 for(txip = xipfeat->subtags; txip != NULL; txip = txip->next)
1402 {
1403 if(txip->tag == INSDFEATURE_INTERVALS)
1404 {
1405 txip->subtags = XMLIndexSameSubTags(entry, txip, INSDINTERVAL);
1406 if(txip->subtags == NULL)
1407 break;
1408 xipsub = txip->subtags;
1409 for(; xipsub != NULL; xipsub = xipsub->next)
1410 if(XMLIndexSubTags(entry, xipsub, xmintkwl) == false ||
1411 XMLCheckRequiredIntTags(xipsub->subtags) == false)
1412 break;
1413 }
1414 else if(txip->tag == INSDFEATURE_QUALS)
1415 {
1416 txip->subtags = XMLIndexSameSubTags(entry, txip,
1417 INSDQUALIFIER);
1418 if(txip->subtags == NULL)
1419 break;
1420 xipsub = txip->subtags;
1421 for(; xipsub != NULL; xipsub = xipsub->next)
1422 if(XMLIndexSubTags(entry, xipsub, xmqualkwl) == false ||
1423 XMLCheckRequiredQualTags(xipsub->subtags) == false)
1424 break;
1425 }
1426 }
1427 if(txip != NULL)
1428 break;
1429 }
1430
1431 if(xipfeat == NULL)
1432 return true;
1433
1434 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1435 "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1436 XMLStringByTag(xmkwl, INSDSEQ_FEATURE_TABLE));
1437 return false;
1438 }
1439
1440 /**********************************************************/
XMLCheckRequiredRefTags(XmlIndexPtr xip)1441 static bool XMLCheckRequiredRefTags(XmlIndexPtr xip)
1442 {
1443 bool got_reference = false;
1444 bool got_journal = false;
1445 bool ret = true;
1446
1447 for(; xip != NULL; xip = xip->next)
1448 {
1449 if(xip->tag == INSDREFERENCE_REFERENCE)
1450 got_reference = true;
1451 else if(xip->tag == INSDREFERENCE_JOURNAL)
1452 got_journal = true;
1453 }
1454
1455 if (!got_reference)
1456 {
1457 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1458 "%s block is missing %s data in XML format file.",
1459 XMLStringByTag(xmsubkwl, INSDREFERENCE),
1460 XMLStringByTag(xmrefkwl, INSDREFERENCE_REFERENCE));
1461 ret = false;
1462 }
1463
1464 if (!got_journal)
1465 {
1466 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1467 "%s block is missing %s data in XML format file.",
1468 XMLStringByTag(xmsubkwl, INSDREFERENCE),
1469 XMLStringByTag(xmrefkwl, INSDREFERENCE_JOURNAL));
1470 ret = false;
1471 }
1472 return(ret);
1473 }
1474
1475 /**********************************************************/
XMLGetRefTypePos(char * reftag,size_t bases)1476 static Int2 XMLGetRefTypePos(char* reftag, size_t bases)
1477 {
1478 Char str[100];
1479
1480 if(reftag == NULL || *reftag == '\0')
1481 return(ParFlat_REF_NO_TARGET);
1482
1483 sprintf(str, "1..%d", (int) bases);
1484
1485 if(StringCmp(reftag, str) == 0)
1486 return(ParFlat_REF_END);
1487 if(StringCmp(reftag, "sites") == 0)
1488 return(ParFlat_REF_SITES);
1489 return(ParFlat_REF_BTW);
1490 }
1491
1492 /**********************************************************/
XMLGetRefType(char * reftag,size_t bases)1493 static Int2 XMLGetRefType(char* reftag, size_t bases)
1494 {
1495 char* p;
1496 Char str[100];
1497 Char str1[100];
1498
1499 if(reftag == NULL)
1500 return(ParFlat_REF_NO_TARGET);
1501
1502 for(p = reftag; *p != '\0' && *p != '(';)
1503 p++;
1504 if(*p == '\0')
1505 return(ParFlat_REF_NO_TARGET);
1506
1507 sprintf(str, "(bases 1 to %d)", (int) bases);
1508 sprintf(str1, "(bases 1 to %d;", (int) bases);
1509
1510 if(StringStr(p, str) != NULL || StringStr(p, str1) != NULL)
1511 return(ParFlat_REF_END);
1512 if(StringStr(p, "(sites)") != NULL)
1513 return(ParFlat_REF_SITES);
1514 return(ParFlat_REF_BTW);
1515 }
1516
1517 /**********************************************************/
XMLCheckRequiredXrefTags(XmlIndexPtr xip)1518 static bool XMLCheckRequiredXrefTags(XmlIndexPtr xip)
1519 {
1520 bool got_dbname = false;
1521 bool got_id = false;
1522 bool ret = true;
1523
1524 for(; xip != NULL; xip = xip->next)
1525 {
1526 if(xip->tag == INSDXREF_DBNAME)
1527 got_dbname = true;
1528 else if(xip->tag == INSDXREF_ID)
1529 got_id = true;
1530 }
1531
1532 if (!got_dbname)
1533 {
1534 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1535 "%s block is missing %s data in XML format file.",
1536 XMLStringByTag(xmsubkwl, INSDXREF),
1537 XMLStringByTag(xmrefkwl, INSDXREF_DBNAME));
1538 ret = false;
1539 }
1540
1541 if (!got_id)
1542 {
1543 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingField,
1544 "%s block is missing %s data in XML format file.",
1545 XMLStringByTag(xmsubkwl, INSDXREF),
1546 XMLStringByTag(xmrefkwl, INSDXREF_ID));
1547 ret = false;
1548 }
1549 return(ret);
1550 }
1551
1552 /**********************************************************/
XMLIndexReferences(char * entry,XmlIndexPtr xip,size_t bases)1553 static bool XMLIndexReferences(char* entry, XmlIndexPtr xip, size_t bases)
1554 {
1555 XmlIndexPtr xipref;
1556 XmlIndexPtr txip;
1557 XmlIndexPtr xipsub;
1558 char* reftagref;
1559 char* reftagpos;
1560
1561 if(xip == NULL || entry == NULL)
1562 return true;
1563
1564 for (; xip != NULL; xip = xip->next)
1565 {
1566 if (xip->tag == INSDSEQ_REFERENCES)
1567 break;
1568 }
1569 if(xip == NULL)
1570 return true;
1571
1572 xip->subtags = XMLIndexSameSubTags(entry, xip, INSDREFERENCE);
1573 if(xip->subtags == NULL)
1574 {
1575 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1576 "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1577 XMLStringByTag(xmkwl, INSDSEQ_REFERENCES));
1578 return false;
1579 }
1580
1581 for(xipref = xip->subtags; xipref != NULL; xipref = xipref->next)
1582 {
1583 if(XMLIndexSubTags(entry, xipref, xmrefkwl) == false ||
1584 XMLCheckRequiredRefTags(xipref->subtags) == false)
1585 break;
1586
1587 reftagref = NULL;
1588 reftagpos = NULL;
1589 for(txip = xipref->subtags; txip != NULL; txip = txip->next)
1590 {
1591 if(txip->tag == INSDREFERENCE_REFERENCE)
1592 {
1593 if(reftagref != NULL)
1594 MemFree(reftagref);
1595 reftagref = XMLGetTagValue(entry, txip);
1596 continue;
1597 }
1598 if(txip->tag == INSDREFERENCE_POSITION)
1599 {
1600 if(reftagpos != NULL)
1601 MemFree(reftagpos);
1602 reftagpos = XMLGetTagValue(entry, txip);
1603 continue;
1604 }
1605 if(txip->tag == INSDREFERENCE_AUTHORS)
1606 {
1607 txip->subtags = XMLIndexSameSubTags(entry, txip, INSDAUTHOR);
1608 if(txip->subtags == NULL)
1609 break;
1610 }
1611 else if(txip->tag == INSDREFERENCE_XREF)
1612 {
1613 txip->subtags = XMLIndexSameSubTags(entry, txip, INSDXREF);
1614 if(txip->subtags == NULL)
1615 break;
1616 xipsub = txip->subtags;
1617 for(; xipsub != NULL; xipsub = xipsub->next)
1618 if(XMLIndexSubTags(entry, xipsub, xmxrefkwl) == false ||
1619 XMLCheckRequiredXrefTags(xipsub->subtags) == false)
1620 break;
1621 }
1622 }
1623
1624 if(reftagpos != NULL)
1625 {
1626 xipref->type = XMLGetRefTypePos(reftagpos, bases);
1627 MemFree(reftagpos);
1628 }
1629 else
1630 xipref->type = XMLGetRefType(reftagref, bases);
1631 if(reftagref != NULL)
1632 MemFree(reftagref);
1633
1634 if(txip != NULL)
1635 break;
1636 }
1637
1638 if(xipref == NULL)
1639 return true;
1640
1641 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1642 "Incorrectly formatted \"%s\" XML block. Entry dropped.",
1643 XMLStringByTag(xmkwl, INSDSEQ_REFERENCES));
1644 return false;
1645 }
1646
1647 /**********************************************************/
XMLIndex(ParserPtr pp)1648 bool XMLIndex(ParserPtr pp)
1649 {
1650 IndexblkPtr* ibpp;
1651 IndexblkPtr ibp;
1652 char* entry;
1653
1654 XMLPerformIndex(pp);
1655
1656 if(pp->indx == 0)
1657 return false;
1658
1659 pp->curindx = 0;
1660 for(ibpp = pp->entrylist; *ibpp != NULL; ibpp++, pp->curindx++)
1661 {
1662 ibp = *ibpp;
1663 if(ibp->len == 0)
1664 {
1665 ErrPostEx(SEV_ERROR, ERR_FORMAT_MissingEnd,
1666 "Missing end tag of XML record, which starts at line %d. Entry dropped.",
1667 ibp->linenum);
1668 ibp->drop = 1;
1669 continue;
1670 }
1671 entry = XMLLoadEntry(pp, true);
1672 if(entry == NULL)
1673 {
1674 ErrPostEx(SEV_FATAL, ERR_INPUT_CannotReadEntry,
1675 "Failed ro read entry from file, which starts at line %d. Entry dropped.",
1676 ibp->linenum);
1677 ibp->drop = 1;
1678 continue;
1679 }
1680
1681 XMLInitialEntry(ibp, entry, pp->accver, pp->source);
1682 if(ibp->drop != 0)
1683 {
1684 MemFree(entry);
1685 continue;
1686 }
1687 if(XMLTagCheck(ibp->xip, xmkwl) == false)
1688 {
1689 ErrPostEx(SEV_ERROR, ERR_FORMAT_XMLFormatError,
1690 "Incorrectly formatted XML record. Entry dropped.");
1691 ibp->drop = 1;
1692 MemFree(entry);
1693 continue;
1694 }
1695 if(XMLAccessionsCheck(pp, ibp, entry) == false)
1696 {
1697 MemFree(entry);
1698 continue;
1699 }
1700 XMLGetSegment(entry, ibp);
1701 if(XMLCheckRequiredTags(pp, ibp) == false)
1702 {
1703 ibp->drop = 1;
1704 MemFree(entry);
1705 continue;
1706 }
1707 if(XMLKeywordsCheck(entry, ibp, pp->source) == false)
1708 {
1709 MemFree(entry);
1710 continue;
1711 }
1712 if(XMLIndexFeatures(entry, ibp->xip) == false ||
1713 XMLIndexReferences(entry, ibp->xip, ibp->bases) == false)
1714 {
1715 ibp->drop = 1;
1716 MemFree(entry);
1717 continue;
1718 }
1719 MemFree(entry);
1720 }
1721
1722 for(pp->num_drop = 0, ibpp = pp->entrylist; *ibpp != NULL; ibpp++)
1723 if((*ibpp)->drop != 0)
1724 pp->num_drop++;
1725
1726 if(pp->indx > 0)
1727 return true;
1728 return false;
1729 }
1730
1731 /**********************************************************/
XMLBuildRefDataBlk(char * entry,XmlIndexPtr xip,Int2 type)1732 DataBlkPtr XMLBuildRefDataBlk(char* entry, XmlIndexPtr xip, Int2 type)
1733 {
1734 XmlIndexPtr txip;
1735 DataBlkPtr dbp;
1736 DataBlkPtr tdbp;
1737
1738 if(entry == NULL || xip == NULL)
1739 return(NULL);
1740
1741 while(xip != NULL && xip->tag != INSDSEQ_REFERENCES)
1742 xip = xip->next;
1743 if(xip == NULL || xip->subtags == NULL)
1744 return(NULL);
1745
1746 for(dbp = NULL, txip = xip->subtags; txip != NULL; txip = txip->next)
1747 {
1748 if(txip->type != type || txip->subtags == NULL)
1749 continue;
1750 if(dbp == NULL)
1751 {
1752 dbp = (DataBlkPtr) MemNew(sizeof(DataBlk));
1753 tdbp = dbp;
1754 }
1755 else
1756 {
1757 tdbp->next = (DataBlkPtr) MemNew(sizeof(DataBlk));
1758 tdbp = tdbp->next;
1759 }
1760 tdbp->type = txip->type;
1761 tdbp->offset = entry;
1762 tdbp->data = (void*) txip->subtags;
1763 tdbp->next = NULL;
1764 }
1765 return(dbp);
1766 }
1767
1768 /**********************************************************/
XMLGetKeywords(char * entry,XmlIndexPtr xip,TKeywordList & keywords)1769 void XMLGetKeywords(char* entry, XmlIndexPtr xip, TKeywordList& keywords)
1770 {
1771 XmlIndexPtr xipkwd;
1772 char* p;
1773
1774 keywords.clear();
1775 if(entry == NULL || xip == NULL)
1776 return;
1777
1778 for(; xip != NULL; xip = xip->next)
1779 if(xip->tag == INSDSEQ_KEYWORDS && xip->subtags != NULL)
1780 break;
1781 if(xip == NULL)
1782 return;
1783
1784 for(xipkwd = xip->subtags; xipkwd != NULL; xipkwd = xipkwd->next)
1785 {
1786 p = XMLGetTagValue(entry, xipkwd);
1787 if(p == NULL)
1788 continue;
1789
1790 keywords.push_back(p);
1791 MemFree(p);
1792 }
1793 }
1794
1795 /**********************************************************/
XMLConcatSubTags(char * entry,XmlIndexPtr xip,Int4 tag,Char sep)1796 char* XMLConcatSubTags(char* entry, XmlIndexPtr xip, Int4 tag, Char sep)
1797 {
1798 XmlIndexPtr txip;
1799 char* buf;
1800 char* p;
1801 char* q;
1802 size_t i;
1803
1804 if(entry == NULL || xip == NULL)
1805 return(NULL);
1806
1807 while(xip != NULL && xip->tag != tag)
1808 xip = xip->next;
1809
1810 if(xip == NULL || xip->subtags == NULL)
1811 return(NULL);
1812
1813 for(i = 0, txip = xip->subtags; txip != NULL; txip = txip->next)
1814 i += (txip->end - txip->start + 2);
1815
1816 buf = (char*) MemNew(i);
1817 buf[0] = '\0';
1818 for(q = buf, txip = xip->subtags; txip != NULL; txip = txip->next)
1819 {
1820 if(txip->end <= txip->start)
1821 continue;
1822 if(buf[0] != '\0')
1823 {
1824 *q++ = sep;
1825 *q++ = ' ';
1826 }
1827 for(i = txip->start, p = entry + txip->start; i < txip->end; i++)
1828 *q++ = *p++;
1829 *q = '\0';
1830 }
1831 XMLRestoreSpecialCharacters(buf);
1832 return(buf);
1833 }
1834
1835 END_NCBI_SCOPE
1836