1 /* accutils.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * RCS $Id: accutils.c,v 6.15 1999/04/01 13:56:34 sicotte Exp $
27 *
28 * Author: J. Epstein
29 *
30 * Version Creation Date: 10/18/93
31 *
32 * File Description:
33 * Utilities which make use of the Entrez "data access library"
34 *
35 * Modifications:
36 * --------------------------------------------------------------------------
37 * $Log: accutils.c,v $
38 * Revision 6.15 1999/04/01 13:56:34 sicotte
39 * Moved WHICH_db_accession,IS_ntdb_accession,IS_protdb_accession to
40 * sequtil.ch
41 * Removed old static functions of Colombe (that are public in salutil.c)
42 * The only code left in accutils.c is the Entrez Access code.
43 *
44 * Revision 6.14 1999/03/31 21:13:47 sicotte
45 * Add info on N-accessions
46 *
47 * Revision 6.13 1999/03/31 13:34:18 sicotte
48 * in WHICH_ntdb_accession, swapped (C** and B** prot accession for embl/ddbj)
49 *
50 * Revision 6.12 1999/03/18 20:24:05 sicotte
51 * changed the define name for NC_ accession
52 *
53 * Revision 6.11 1999/03/18 20:18:18 sicotte
54 * added REFSEQ accession numbers and macro ACCN_IS_GENBANK()
55 *
56 * Revision 6.10 1999/03/18 15:34:04 sicotte
57 * Updated Accession List and added protein Accessions
58 * for functions IS_ntdb_accession and IS_protdb_accession.
59 * New function WHICH_db_accession with return code allowing
60 * to figure out the molecule type and the database from macros
61 * in accutils.h
62 *
63 * Revision 6.9 1999/02/24 16:48:09 kans
64 * added IS_ntdb_accession and IS_protdb_accession, removed NormalizeSeqAlignId
65 *
66 * Revision 6.8 1999/01/27 16:20:51 chappey
67 * update IS_ntdb_accession with AB, AJ
68 *
69 * Revision 6.7 1999/01/06 14:18:36 grisha
70 * add defines to switch ID0/ID1 usage
71 *
72 * Revision 6.6 1998/06/12 19:19:10 kans
73 * fixed unix compiler warnings
74 *
75 * Revision 6.5 1998/04/28 19:29:13 shavirin
76 * Fixed minor purify detected bug.
77 *
78 * Revision 6.4 1998/02/11 19:50:00 kans
79 * FastaSeqPort takes code parameter
80 *
81 * Revision 6.3 1997/11/14 22:13:50 vakatov
82 * [WIN32,DLL] Added NLM_EXTERN's
83 *
84 * Revision 6.2 1997/09/12 15:28:26 chappey
85 * Revision changes in NormalizeSeqAlign
86 *
87 * Revision 6.1 1997/09/04 14:14:50 chappey
88 * Revision changes in NormalizeSeqAlign
89 *
90 * Revision 5.15 1997/08/07 16:02:40 kans
91 * Revision added NormalizeSeqAlignId (Colombe)
92 *
93 * Revision 5.14 1997/06/26 21:55:17 vakatov
94 * Revision [PC] DLL'd "ncbicdr.lib", "ncbiacc.lib", "ncbinacc.lib" and "ncbicacc.lib"
95 *
96 * Revision 5.13 1997/05/14 14:27:57 shavirin
97 * Revision Function AccessionToFasta adopted for protein accessions
98 *
99 * Revision 5.11 1997/05/13 21:11:05 shavirin
100 * Revision Changed function AccessionToFasta() to use PubMed accession indexes
101 *
102 * Revision 5.10 1997/02/04 18:58:01 epstein
103 * Revision add GetFullEntrezTermList() function
104 *
105 * Revision 5.9 1997/01/13 15:13:34 brandon
106 * changed EntrezStringToField
107 *
108 * Revision 5.8 1997/01/07 17:34:33 epstein
109 * eliminate PMENTREZ by default
110 *
111 * Revision 5.7 1996/10/01 18:19:03 shavirin
112 * Removed unused variable and fixed memory problems in AccessionToFasta()
113 *
114 * Revision 5.6 1996/09/26 17:52:45 ostell
115 * made AccessionToFasta LIBCALL
116 *
117 * Revision 5.5 1996/09/23 21:22:09 shavirin
118 * Added new function AccessionToFasta(), returning Fasta entry for
119 * given Accession or GI as a string
120 *
121 * Revision 5.2 1996/08/14 15:15:05 brandon
122 * added date parameter to tleval functions
123 *
124 * Revision 5.1 1996/07/01 14:06:35 epstein
125 * add 'join function' EntrezCommonHierAncestor()
126 *
127 * Revision 4.6 1996/05/21 17:37:31 epstein
128 * eliminate case-sensitivity
129 *
130 * Revision 4.5 1996/05/14 21:01:40 epstein
131 * remove references to FLD_MLOC
132 *
133 * Revision 4.4 1996/04/23 19:14:46 epstein
134 * memory-leak/access cleanup, per D. Vakatov
135 *
136 * Revision 4.3 1995/10/11 13:40:37 epstein
137 * make EntrezStringToField() and EntrezFieldToString() data-driven
138 *
139 * Revision 4.2 1995/10/02 02:36:00 epstein
140 * add range-checking
141 *
142 * Revision 4.1 1995/08/24 20:44:21 epstein
143 * add more stuff for genomes
144 * ==========================================================================
145 */
146
147 #define REVISION_STR "$Revision: 6.15 $"
148
149 #include <accutils.h>
150 #ifndef _CDROMLIB_
151 #include <cdromlib.h>
152 #endif
153
154 #define ERRPOST_LVL(x) ((x) == NULL ? SEV_INFO : SEV_ERROR)
155
156 #define LEXCHAR_LPAREN 1
157 #define LEXCHAR_RPAREN 2
158 #define LEXCHAR_LBRACKET 3
159 #define LEXCHAR_RBRACKET 4
160 #define LEXCHAR_QUOTE 5
161 #define LEXCHAR_AND 6
162 #define LEXCHAR_OR 7
163 #define LEXCHAR_NOT 8
164 #define LEXCHAR_COMMA 9
165 #define LEXCHAR_ATSIGN 10
166 #define LEXCHAR_BACKSLASH 11
167 #define LEXCHAR_WHITESPACE 12
168 #define LEXCHAR_SEMICOLON 13
169 #define LEXCHAR_COLON 14
170 #define LEXCHAR_EOL 15
171 #define LEXCHAR_NULL 16
172 #define LEXCHAR_OTHER 17
173
174 #define LEXSTATE_IDLE 0
175 #define LEXSTATE_BACKSLASHED 1
176 #define LEXSTATE_INQUOTE 2
177 #define LEXSTATE_INQUOTE_AFTERBSLASH 3
178 #define LEXSTATE_INSTRING 4
179 #define LEXSTATE_ERROR 5
180
181 #define LEXTOK_LPAREN 1
182 #define LEXTOK_RPAREN 2
183 #define LEXTOK_LBRACKET 3
184 #define LEXTOK_RBRACKET 4
185 #define LEXTOK_AND 5
186 #define LEXTOK_OR 6
187 #define LEXTOK_NOT 7
188 #define LEXTOK_COMMA 8
189 #define LEXTOK_ATSIGN 9
190 #define LEXTOK_STRING 10
191 #define LEXTOK_RANGE 11
192
193 static Int2 lexPosition = 0;
194 static CharPtr lexString = NULL;
195 static Int2 lexState = LEXSTATE_IDLE;
196
197 static ValNodePtr nextnode = NULL;
198 static ValNode nextRealNode;
199 static Int2 lastGood = -1;
200 static Int2 lastBad = -1;
201
LexClassifyChar(Char c)202 static Int2 LexClassifyChar(Char c)
203 {
204 Int2 retval;
205
206 switch(c) {
207 case '(': retval = LEXCHAR_LPAREN; break;
208 case ')': retval = LEXCHAR_RPAREN; break;
209 case '[': retval = LEXCHAR_LBRACKET; break;
210 case ']': retval = LEXCHAR_RBRACKET; break;
211 case '"': retval = LEXCHAR_QUOTE; break;
212 case '&': retval = LEXCHAR_AND; break;
213 case '|': retval = LEXCHAR_OR; break;
214 case '-': retval = LEXCHAR_NOT; break;
215 case ',': retval = LEXCHAR_COMMA; break;
216 case '@': retval = LEXCHAR_ATSIGN; break;
217 case '\\': retval = LEXCHAR_BACKSLASH; break;
218 case ' ':
219 case '\t': retval = LEXCHAR_WHITESPACE; break;
220 case ';': retval = LEXCHAR_SEMICOLON; break;
221 case ':': retval = LEXCHAR_COLON; break;
222 case '\0': retval = LEXCHAR_NULL; break;
223 case '\r':
224 case '\n': retval = LEXCHAR_EOL; break;
225 default: retval = LEXCHAR_OTHER; break;
226 }
227
228 return retval;
229 }
230
231 /* Returns -1 if no token available, else the position of the token */
EntrezLexExpression(CharPtr str,ValNodePtr vnp)232 static Int2 EntrezLexExpression(CharPtr str, ValNodePtr vnp)
233 {
234 Int2 startPos;
235 Int2 classChar;
236 Int2 token = 0;
237 Boolean done;
238 Char c;
239 CharPtr lexToken = NULL;
240 CharPtr lexTokenStart;
241 Int2 len;
242
243 if (str == NULL && lexString == NULL)
244 return -1;
245 if (str != NULL)
246 {
247 MemFree(lexString);
248 lexString = StringSave(str);
249 lexPosition = 0;
250 lexState = LEXSTATE_IDLE;
251 }
252 if (vnp == NULL)
253 return -1;
254
255 len = StringLen(lexString);
256 startPos = lexPosition;
257
258 if (lexPosition >= len)
259 {
260 lexState = LEXSTATE_ERROR;
261 token = -1;
262 lexToken = MemNew(1);
263 } else {
264 lexToken = MemNew(StringLen(&lexString[lexPosition]) + 1);
265 }
266 lexTokenStart = lexToken;
267
268 for (done = FALSE; ! done && lexPosition <= len; lexPosition++)
269 {
270 c = lexString[lexPosition];
271 classChar = LexClassifyChar(c);
272 switch (lexState) {
273 case LEXSTATE_IDLE:
274 switch (classChar) {
275 case LEXCHAR_LPAREN:
276 token = LEXTOK_LPAREN; done = TRUE; break;
277 case LEXCHAR_RPAREN:
278 token = LEXTOK_RPAREN; done = TRUE; break;
279 case LEXCHAR_LBRACKET:
280 token = LEXTOK_LBRACKET; done = TRUE; break;
281 case LEXCHAR_RBRACKET:
282 token = LEXTOK_RBRACKET; done = TRUE; break;
283 case LEXCHAR_AND:
284 token = LEXTOK_AND; done = TRUE; break;
285 case LEXCHAR_OR:
286 token = LEXTOK_OR; done = TRUE; break;
287 case LEXCHAR_NOT:
288 token = LEXTOK_NOT; done = TRUE; break;
289 case LEXCHAR_COMMA:
290 token = LEXTOK_COMMA; done = TRUE; break;
291 case LEXCHAR_ATSIGN:
292 token = LEXTOK_ATSIGN; done = TRUE; break;
293 case LEXCHAR_COLON:
294 token = LEXTOK_RANGE; done = TRUE; break;
295 case LEXCHAR_QUOTE:
296 lexState = LEXSTATE_INQUOTE; break;
297 case LEXCHAR_BACKSLASH:
298 lexState = LEXSTATE_BACKSLASHED; break;
299 case LEXCHAR_EOL:
300 case LEXCHAR_WHITESPACE:
301 startPos = lexPosition + 1; break;
302 case LEXCHAR_SEMICOLON:
303 case LEXCHAR_NULL:
304 lexState = LEXSTATE_ERROR; done = TRUE; break;
305 case LEXCHAR_OTHER:
306 default:
307 lexState = LEXSTATE_INSTRING; *lexToken++ = c; break;
308 }
309 break;
310 case LEXSTATE_BACKSLASHED:
311 switch (classChar) {
312 case LEXCHAR_NULL:
313 case LEXCHAR_EOL:
314 *lexToken++ = '\0'; done = TRUE; lexState = LEXSTATE_IDLE; break;
315 case LEXCHAR_LPAREN:
316 case LEXCHAR_RPAREN:
317 case LEXCHAR_LBRACKET:
318 case LEXCHAR_RBRACKET:
319 case LEXCHAR_QUOTE:
320 case LEXCHAR_AND:
321 case LEXCHAR_OR:
322 case LEXCHAR_NOT:
323 case LEXCHAR_COMMA:
324 case LEXCHAR_ATSIGN:
325 case LEXCHAR_BACKSLASH:
326 case LEXCHAR_WHITESPACE:
327 case LEXCHAR_SEMICOLON:
328 case LEXCHAR_COLON:
329 case LEXCHAR_OTHER:
330 default:
331 lexState = LEXSTATE_INSTRING; *lexToken++ = c; break;
332 }
333 break;
334 case LEXSTATE_INQUOTE:
335 switch (classChar) {
336 case LEXCHAR_QUOTE:
337 token = LEXTOK_STRING;
338 *lexToken++ = '\0';
339 done = TRUE;
340 lexState = LEXSTATE_IDLE;
341 break;
342 case LEXCHAR_BACKSLASH:
343 lexState = LEXSTATE_INQUOTE_AFTERBSLASH; break;
344 case LEXCHAR_NULL:
345 case LEXCHAR_EOL:
346 lexState = LEXSTATE_ERROR; done = TRUE; break;
347 default:
348 *lexToken++ = c; break;
349 }
350 break;
351 case LEXSTATE_INQUOTE_AFTERBSLASH:
352 switch (classChar) {
353 case LEXCHAR_NULL:
354 case LEXCHAR_EOL:
355 lexState = LEXSTATE_ERROR; done = TRUE; break;
356 default:
357 lexState = LEXSTATE_INQUOTE; *lexToken++ = c; break;
358 }
359 break;
360 case LEXSTATE_INSTRING:
361 switch (classChar) {
362 case LEXCHAR_WHITESPACE:
363 case LEXCHAR_SEMICOLON:
364 case LEXCHAR_NULL:
365 case LEXCHAR_EOL:
366 token = LEXTOK_STRING;
367 *lexToken++ = '\0';
368 done = TRUE;
369 lexState = LEXSTATE_IDLE;
370 break;
371 case LEXCHAR_BACKSLASH:
372 lexState = LEXSTATE_BACKSLASHED;
373 break;
374 case LEXCHAR_QUOTE:
375 lexState = LEXSTATE_INQUOTE;
376 break;
377 case LEXCHAR_OTHER:
378 *lexToken++ = c; break;
379 default:
380 token = LEXTOK_STRING;
381 *lexToken++ = '\0';
382 done = TRUE;
383 lexState = LEXSTATE_IDLE;
384 lexPosition--; /* push back the last character */
385 break;
386 }
387 break;
388 case LEXSTATE_ERROR:
389 done = TRUE;
390 break;
391 }
392 }
393
394 vnp->choice = (Uint1) token;
395 vnp->data.ptrvalue = NULL;
396 if (token == LEXTOK_STRING)
397 {
398 vnp->data.ptrvalue = lexTokenStart;
399 } else {
400 MemFree(lexTokenStart);
401 }
402 if (lexState == LEXSTATE_ERROR)
403 return -1;
404 else
405 return startPos;
406 }
407
StrNextNode(void)408 static void StrNextNode(void)
409 {
410 nextnode = &nextRealNode;
411
412 if (EntrezLexExpression(NULL, nextnode) < 0)
413 {
414 nextnode = NULL;
415 }
416
417 lastGood = lastBad;
418 lastBad = lexPosition;
419 }
420
421 static Boolean StrExpression PROTO((ValNodePtr elst, DocType db, DocField fld));
422
FindTermRequired(CharPtr term)423 static Boolean FindTermRequired(CharPtr term)
424 {
425 Int2 len;
426
427 if (term == NULL)
428 return FALSE;
429 len = StrLen(term);
430 return ((len > 3 && term[len-1] == '.' && term[len-2] == '.' &&
431 term[len-3] == '.') || (len > 1 && term[len-1] == '*') ||
432 StrChr(term, '?') != NULL);
433 }
434
435 /* traverse all the possible fields, and take the "union" of this term */
436 /* over all the possibilities for the specified database */
AddAllFields(ValNodePtr elst,CharPtr term,DocType db,Boolean special)437 static Boolean AddAllFields(ValNodePtr elst, CharPtr term, DocType db, Boolean special)
438 {
439 EntrezInfoPtr eip;
440 Boolean first = TRUE;
441 Int4 specialCount;
442 Int4 totalCount;
443 EntrezFieldDataPtr fields;
444 DocField fld;
445 Boolean findTermRequired;
446
447 if ((eip = EntrezGetInfo()) == NULL || elst == NULL)
448 return FALSE;
449 if (db < 0 || db >= eip->type_count)
450 return FALSE;
451 findTermRequired = FindTermRequired(term);
452 fields = eip->types[db].fields;
453 for (fld = 0; fld < eip->field_count; fld++)
454 {
455 if (fields == NULL || fields[fld].num_terms <= 0 || fld ==
456 FLD_ORGN_HIER || (db == TYP_ML && fld == FLD_PROT))
457 continue;
458 if (first)
459 {
460 EntrezTLAddLParen(elst);
461 first = FALSE;
462 } else {
463 EntrezTLAddOR(elst);
464 }
465 if (findTermRequired)
466 {
467 EntrezFindTerm(db, fld, term, &specialCount, &totalCount);
468 }
469 EntrezTLAddTerm(elst, term, db, fld, special);
470 }
471
472 if (! first)
473 {
474 EntrezTLAddRParen(elst);
475 }
476
477 return TRUE;
478 }
479
480 static Boolean
StrFactor(ValNodePtr elst,DocType db,DocField fld)481 StrFactor(ValNodePtr elst, DocType db, DocField fld)
482 {
483 if (nextnode == NULL)
484 {
485 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "Null factor");
486 return FALSE;
487 }
488 if (nextnode->choice == LEXTOK_LPAREN)
489 {
490 if (elst != NULL)
491 EntrezTLAddLParen (elst);
492 StrNextNode();
493 if (! StrExpression(elst, db, fld))
494 {
495 return FALSE;
496 }
497 if (nextnode != NULL && nextnode->choice == LEXTOK_RPAREN)
498 {
499 if (elst != NULL)
500 EntrezTLAddRParen (elst);
501 StrNextNode();
502 } else {
503 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "Missing right paren");
504 return FALSE;
505 }
506 } else {
507 if (nextnode->choice == LEXTOK_STRING)
508 {
509 CharPtr term;
510 Boolean special = FALSE;
511 CharPtr fldStr;
512 CharPtr highRangeStr = NULL;
513 Boolean allFields = FALSE;
514
515 term = (CharPtr) (nextnode->data.ptrvalue);
516 StrNextNode();
517 if (nextnode != NULL && nextnode->choice == LEXTOK_RANGE)
518 {
519 StrNextNode();
520 if (nextnode == NULL || nextnode->choice != LEXTOK_STRING)
521 {
522 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing second half of range");
523 MemFree(term);
524 return FALSE;
525 }
526 highRangeStr = (CharPtr) (nextnode->data.ptrvalue);
527 StrNextNode();
528 }
529 if (nextnode != NULL && nextnode->choice == LEXTOK_LBRACKET)
530 {
531 StrNextNode();
532 if (nextnode == NULL || nextnode->choice != LEXTOK_STRING)
533 {
534 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing field id after bracket");
535 MemFree(term);
536 MemFree(highRangeStr);
537 return FALSE;
538 }
539 fldStr = (CharPtr) (nextnode->data.ptrvalue);
540 if (fldStr != NULL && StrCmp(fldStr, "*") == 0)
541 {
542 allFields = TRUE;
543 } else {
544 fld = EntrezStringToField(db, fldStr);
545 }
546 MemFree(nextnode->data.ptrvalue);
547 if (!allFields && fld < 0)
548 {
549 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "bad field identifier");
550 MemFree(term);
551 MemFree(highRangeStr);
552 return FALSE;
553 }
554 StrNextNode();
555 if (nextnode == NULL || (nextnode->choice != LEXTOK_COMMA &&
556 nextnode->choice != LEXTOK_RBRACKET))
557 {
558 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing right bracket");
559 MemFree(term);
560 MemFree(highRangeStr);
561 return FALSE;
562 }
563 if (nextnode->choice == LEXTOK_COMMA)
564 {
565 StrNextNode();
566 if (nextnode == NULL || nextnode->choice != LEXTOK_STRING ||
567 StringCmp(nextnode->data.ptrvalue, "S") != 0)
568 {
569 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "field qualifier error");
570 MemFree(term);
571 MemFree(highRangeStr);
572 return FALSE;
573 }
574 MemFree(nextnode->data.ptrvalue);
575 special = TRUE;
576 StrNextNode();
577 if (nextnode == NULL || nextnode->choice != LEXTOK_RBRACKET)
578 {
579 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing right bracket");
580 MemFree(term);
581 MemFree(highRangeStr);
582 return FALSE;
583 }
584 }
585 StrNextNode();
586 }
587
588 /* if ( the default specified by caller is -1 ==> all ) then */
589 if (fld < 0)
590 {
591 allFields = TRUE;
592 if (highRangeStr != NULL)
593 {
594 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "ranges require explicit field specification");
595 MemFree(term);
596 MemFree(highRangeStr);
597 return FALSE;
598 }
599 }
600
601 if (elst != NULL)
602 {
603 if (allFields)
604 {
605 AddAllFields(elst, term, db, special);
606 } else {
607 Int4 specialCount;
608 Int4 totalCount;
609
610 if (FindTermRequired(term))
611 {
612 EntrezFindTerm(db, fld, term, &specialCount, &totalCount);
613 }
614 EntrezTLAddTermWithRange(elst, term, db, fld, special, highRangeStr);
615 }
616 }
617 MemFree (term);
618 MemFree(highRangeStr);
619 } else {
620 ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "invalid token");
621 return FALSE;
622 }
623 }
624
625 return TRUE;
626 }
627
628
629 static Boolean
StrTerm(ValNodePtr elst,DocType db,DocField fld)630 StrTerm(ValNodePtr elst, DocType db, DocField fld)
631 {
632 if (! StrFactor(elst, db, fld))
633 return FALSE;
634 while (nextnode != NULL && nextnode->choice == LEXTOK_AND)
635 {
636 if (elst != NULL)
637 EntrezTLAddAND(elst);
638 StrNextNode();
639 if (! StrFactor(elst, db, fld))
640 return FALSE;
641 }
642
643 return TRUE;
644 }
645
646 static Boolean
StrDiff(ValNodePtr elst,DocType db,DocField fld)647 StrDiff(ValNodePtr elst, DocType db, DocField fld)
648 {
649 if (! StrTerm(elst, db, fld))
650 return FALSE;
651 while (nextnode != NULL && nextnode->choice == LEXTOK_OR)
652 {
653 if (elst != NULL)
654 EntrezTLAddOR(elst);
655 StrNextNode();
656 if (! StrTerm(elst, db, fld))
657 return FALSE;
658 }
659
660 return TRUE;
661 }
662
663 static Boolean
StrExpression(ValNodePtr elst,DocType db,DocField fld)664 StrExpression(ValNodePtr elst, DocType db, DocField fld)
665 {
666 if (! StrDiff(elst, db, fld))
667 return FALSE;
668 while (nextnode != NULL && nextnode->choice == LEXTOK_NOT)
669 {
670 if (elst != NULL)
671 EntrezTLAddBUTNOT(elst);
672 StrNextNode();
673 if (! StrDiff(elst, db, fld))
674 return FALSE;
675 }
676
677 return TRUE;
678 }
679
EntrezFieldToString(DocType db,DocField fld)680 NLM_EXTERN CharPtr LIBCALL EntrezFieldToString(DocType db, DocField fld)
681 {
682 CharPtr fldStr;
683 static Char str[6];
684
685 if (EntrezIsInited())
686 {
687 EntrezInfoPtr eip;
688
689 if ((eip = EntrezGetInfo()) != NULL && fld < eip->field_count)
690 {
691 StrNCpy(str, eip->field_info[fld].tag, sizeof(str) - 1);
692 StringUpper(str);
693 return str;
694 }
695 }
696
697 switch(fld)
698 {
699 case FLD_WORD:
700 fldStr = "WORD"; break;
701 case FLD_MESH:
702 fldStr = "MESH"; break;
703 case FLD_AUTH:
704 fldStr = "AUTH"; break;
705 case FLD_JOUR:
706 fldStr = "JOUR"; break;
707 case FLD_GENE:
708 fldStr = "GENE"; break;
709 case FLD_KYWD:
710 fldStr = "KYWD"; break;
711 case FLD_ECNO:
712 fldStr = "ECNO"; break;
713 case FLD_ORGN:
714 fldStr = "ORGN"; break;
715 case FLD_ACCN:
716 fldStr = "ACCN"; break;
717 case FLD_PROT:
718 fldStr = "PROT"; break;
719 case FLD_ORGN_HIER:
720 fldStr = "HIER"; break;
721 case FLD_DATE:
722 fldStr = "DATE"; break;
723 case FLD_FKEY:
724 fldStr = "FKEY"; break;
725 case FLD_PROP:
726 fldStr = "PROP"; break;
727 case FLD_SUBS:
728 fldStr = "SUBS"; break;
729 default:
730 fldStr = "????";
731 }
732
733 return StringSave(fldStr);
734 }
735
EntrezStringToField(DocType db,CharPtr str)736 NLM_EXTERN DocField LIBCALL EntrezStringToField(DocType db, CharPtr str)
737 {
738 if (str == NULL)
739 return -1;
740
741 #ifdef _PMENTREZ_
742 return PMEntrezStringToField(db,str);
743 #else
744
745 if (EntrezIsInited())
746 {
747 EntrezInfoPtr eip;
748 DocField fld;
749
750 if ((eip = EntrezGetInfo()) != NULL)
751 {
752 for (fld = 0; fld < eip->field_count; fld++)
753 {
754 if (StringICmp(str, eip->field_info[fld].tag) == 0)
755 return fld;
756 }
757 }
758
759
760
761 }
762
763 if (StringICmp(str, "WORD") == 0)
764 return FLD_WORD;
765 if (StringICmp(str, "MESH") == 0)
766 return FLD_MESH;
767 if (StringICmp(str, "AUTH") == 0)
768 return FLD_AUTH;
769 if (StringICmp(str, "JOUR") == 0)
770 return FLD_JOUR;
771 if (StringICmp(str, "GENE") == 0)
772 return FLD_GENE;
773 if (StringICmp(str, "KYWD") == 0)
774 return FLD_KYWD;
775 if (StringICmp(str, "ECNO") == 0)
776 return FLD_ECNO;
777 if (StringICmp(str, "ORGN") == 0)
778 return FLD_ORGN;
779 if (StringICmp(str, "ACCN") == 0)
780 return FLD_ACCN;
781 if (StringICmp(str, "PROT") == 0)
782 return FLD_PROT;
783 if (StringICmp(str, "HIER") == 0)
784 return FLD_ORGN_HIER;
785 if (StringICmp(str, "DATE") == 0)
786 return FLD_DATE;
787 if (StringICmp(str, "FKEY") == 0)
788 return FLD_FKEY;
789 if (StringICmp(str, "PROP") == 0)
790 return FLD_PROP;
791 if (StringICmp(str, "SUBS") == 0)
792 return FLD_SUBS;
793 return -1;
794 #endif
795 }
796
797
EntrezPMTLEvalString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)798 NLM_EXTERN LinkSetPtr LIBCALL EntrezPMTLEvalString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
799 {
800 LinkSetPtr lsp;
801 ValNodePtr elst;
802
803 if (begin != NULL)
804 {
805 *begin = -1;
806 }
807 if (end != NULL)
808 {
809 *end = -1;
810 }
811
812 if (str == NULL || *str == '\0')
813 {
814 return NULL;
815 }
816
817 if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
818 {
819 return NULL;
820 }
821
822 if ((elst = EntrezTLNew(db)) == NULL)
823 {
824 return NULL;
825 }
826
827 EntrezLexExpression(str, NULL);
828 StrNextNode();
829
830 lsp = NULL;
831 if (StrExpression(elst, db, fld) && nextnode == NULL)
832 {
833 #ifdef _PMENTREZ_
834 lsp = EntrezPMTLEval(elst,edc);
835 #else
836 lsp = EntrezTLEval(elst);
837 #endif /* _PMENTREZ_ */
838 }
839 EntrezTLFree(elst);
840
841 if (lastGood < lastBad)
842 {
843 lastGood = lastBad;
844 }
845 if (begin != NULL)
846 {
847 *begin = lastGood;
848 }
849 if (end != NULL)
850 {
851 *end = lastBad;
852 }
853
854 return lsp;
855 }
856
EntrezTLEvalString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)857 NLM_EXTERN LinkSetPtr LIBCALL EntrezTLEvalString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
858 {
859 return EntrezPMTLEvalString(str,db,fld,begin,end,NULL);
860 }
861
EntrezPMTLEvalXString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)862 NLM_EXTERN ByteStorePtr LIBCALL EntrezPMTLEvalXString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
863 {
864 ByteStorePtr bsp;
865 ValNodePtr elst;
866
867 if (begin != NULL)
868 {
869 *begin = -1;
870 }
871 if (end != NULL)
872 {
873 *end = -1;
874 }
875
876 if (str == NULL || *str == '\0')
877 {
878 return NULL;
879 }
880
881 if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
882 {
883 return NULL;
884 }
885
886 if ((elst = EntrezTLNew(db)) == NULL)
887 {
888 return NULL;
889 }
890
891 EntrezLexExpression(str, NULL);
892 StrNextNode();
893
894 bsp = NULL;
895 if (StrExpression(elst, db, fld) && nextnode == NULL)
896 {
897 #ifdef _PMENTREZ_
898 bsp = EntrezPMTLEvalX(elst,edc);
899 #else
900 bsp = EntrezTLEvalX(elst);
901 #endif
902 }
903
904 EntrezTLFree(elst);
905
906 if (lastGood < lastBad)
907 {
908 lastGood = lastBad;
909 }
910 if (begin != NULL)
911 {
912 *begin = lastGood;
913 }
914 if (end != NULL)
915 {
916 *end = lastBad;
917 }
918
919 return bsp;
920 }
921
EntrezTLEvalXString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)922 NLM_EXTERN ByteStorePtr LIBCALL EntrezTLEvalXString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
923 {
924 return EntrezPMTLEvalXString(str,db,fld,begin,end,NULL);
925 }
926
EntrezPMTLEvalCountString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)927 NLM_EXTERN Int4 LIBCALL EntrezPMTLEvalCountString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
928 {
929 Int4 count;
930 ValNodePtr elst;
931
932 if (begin != NULL)
933 {
934 *begin = -1;
935 }
936 if (end != NULL)
937 {
938 *end = -1;
939 }
940
941 if (str == NULL || *str == '\0')
942 {
943 return 0;
944 }
945
946 if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
947 {
948 return 0;
949 }
950
951 if ((elst = EntrezTLNew(db)) == NULL)
952 {
953 return 0;
954 }
955
956 EntrezLexExpression(str, NULL);
957 StrNextNode();
958
959 count = 0;
960 if (StrExpression(elst, db, fld) && nextnode == NULL)
961 {
962 #ifdef _PMENTREZ_
963 count = EntrezPMTLEvalCount(elst,edc);
964 #else
965 count = EntrezTLEvalCount(elst);
966 #endif /* _PMENTREZ_ */
967 }
968 EntrezTLFree(elst);
969
970 if (lastGood < lastBad)
971 {
972 lastGood = lastBad;
973 }
974 if (begin != NULL)
975 {
976 *begin = lastGood;
977 }
978 if (end != NULL)
979 {
980 *end = lastBad;
981 }
982
983 return count;
984 }
985
EntrezTLEvalCountString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)986 NLM_EXTERN Int4 LIBCALL EntrezTLEvalCountString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
987 {
988 return EntrezPMTLEvalCountString(str,db,fld,begin,end,NULL);
989 }
990
991
EntrezPMTLParseString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)992 NLM_EXTERN Boolean LIBCALL EntrezPMTLParseString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
993 {
994 Boolean retval;
995
996 if (begin != NULL)
997 {
998 *begin = -1;
999 }
1000 if (end != NULL)
1001 {
1002 *end = -1;
1003 }
1004
1005 if (str == NULL || *str == '\0')
1006 {
1007 return FALSE;
1008 }
1009
1010 if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
1011 {
1012 return FALSE;
1013 }
1014 EntrezLexExpression(str, NULL);
1015 StrNextNode();
1016
1017 retval = StrExpression(NULL, db, fld) && nextnode == NULL;
1018
1019 if (lastGood < lastBad)
1020 {
1021 lastGood = lastBad;
1022 }
1023 if (begin != NULL)
1024 {
1025 *begin = lastGood;
1026 }
1027 if (end != NULL)
1028 {
1029 *end = lastBad;
1030 }
1031
1032 return retval;
1033 }
1034
EntrezTLParseString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)1035 NLM_EXTERN Boolean LIBCALL EntrezTLParseString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
1036 {
1037 return EntrezPMTLParseString(str,db,fld,begin,end,NULL);
1038 }
1039
1040
EntrezCommonHierAncestor(EntrezHierarchyPtr ehp1,EntrezHierarchyPtr ehp2,Int2Ptr distance1,Int2Ptr distance2)1041 NLM_EXTERN CharPtr LIBCALL EntrezCommonHierAncestor(EntrezHierarchyPtr ehp1, EntrezHierarchyPtr ehp2, Int2Ptr distance1, Int2Ptr distance2)
1042 {
1043 Int2 count;
1044 CharPtr retval;
1045
1046 if (ehp1 == NULL || ehp2 == NULL || ehp1->lineage == NULL || ehp2->lineage == NULL)
1047 return NULL;
1048
1049 for (count = 0; count < ehp1->numInLineage && count < ehp2->numInLineage;
1050 count++) {
1051 if (StrCmp(ehp1->lineage[count], ehp2->lineage[count]) != 0)
1052 break;
1053 }
1054
1055 if (count <= 0)
1056 return NULL;
1057 count--;
1058 retval = StringSave(ehp1->lineage[count]);
1059
1060 if (distance1 != NULL)
1061 *distance1 = ehp1->numInLineage - count;
1062 if (distance2 != NULL)
1063 *distance2 = ehp2->numInLineage - count;
1064
1065 return retval;
1066 }
1067
1068 /*****************************************************************************
1069 *
1070 * Function: AccessionToFasta
1071 *
1072 * Description: Returns Fasta entry for given Accession or GI number
1073 *
1074 *****************************************************************************/
1075 #define ATF_INIT_BUFF_SIZE 256
1076
AccessionToFasta(CharPtr string)1077 NLM_EXTERN FastaSeqPtr LIBCALL AccessionToFasta(CharPtr string)
1078 {
1079 Int4 gi=0;
1080 SeqEntryPtr sep;
1081 BioseqPtr bsp = NULL;
1082 SeqPortPtr spp;
1083 FastaSeqPtr fseq;
1084 Char buff[512];
1085 Int4 SequenceLen = 0;
1086 Uint1 code;
1087 Boolean is_na;
1088
1089 CharPtr str;
1090 ByteStorePtr bstore;
1091 Int4 GiNum;
1092
1093 if(string == NULL)
1094 return NULL;
1095
1096 if((fseq = MemNew(sizeof(FastaSeq))) == NULL)
1097 return NULL;
1098
1099 fseq->label = NULL;
1100 fseq->seq = NULL;
1101
1102 if((gi = atol(string)) > 0) {
1103 fseq->gi = gi;
1104 } else {
1105
1106 str = (CharPtr) MemNew(StringLen(string)+30);
1107 sprintf(str, "\"%s\"[ACCN]", string);
1108
1109 if((bstore = EntrezTLEvalXString(str, TYP_NT,
1110 -1, NULL, NULL)) == NULL ||
1111 (GiNum = BSLen(bstore)/sizeof(DocUid)) != 1) {
1112
1113 /* Try protein accessions */
1114
1115 if((bstore = EntrezTLEvalXString(str, TYP_AA,
1116 -1, NULL, NULL)) == NULL ||
1117 (GiNum = BSLen(bstore)/sizeof(DocUid)) != 1) {
1118
1119 MemFree(fseq);
1120 MemFree(str);
1121 return NULL;
1122 }
1123 }
1124
1125 MemFree(str);
1126
1127 BSSeek(bstore, 0L, 0);
1128
1129 BSRead(bstore, &gi, sizeof(Int4));
1130 BSFree(bstore);
1131
1132 fseq->gi = gi;
1133 }
1134
1135 /* Now fetching sequence and defline from Entrez */
1136
1137 if((sep = EntrezSeqEntryGet(gi, 1)) == NULL) {
1138 return NULL;
1139 }
1140
1141 if((bsp = find_big_bioseq(sep)) == NULL) {
1142 SeqEntryFree(sep);
1143 return NULL;
1144 }
1145
1146 StringCpy(buff, ">");
1147 SeqIdWrite(bsp->id, buff+1, PRINTID_FASTA_LONG, sizeof(buff));
1148 StringCat(buff, " ");
1149 CreateDefLine(NULL, bsp, buff+StringLen(buff),
1150 sizeof(buff)-StringLen(buff),
1151 0, NULL, NULL);
1152
1153 fseq->label = StringSave(buff);
1154 is_na = ISA_na (bsp->mol);
1155 if (is_na)
1156 code = Seq_code_iupacna;
1157 else
1158 code = Seq_code_ncbieaa;
1159 spp = FastaSeqPort(bsp, is_na, FALSE, code);
1160 fseq->seq = MemNew(ATF_INIT_BUFF_SIZE+1);
1161
1162 while (FastaSeqLine(spp, fseq->seq+SequenceLen,
1163 ATF_INIT_BUFF_SIZE, TRUE)) {
1164 SequenceLen += ATF_INIT_BUFF_SIZE;
1165 fseq->seq = Realloc(fseq->seq, SequenceLen + ATF_INIT_BUFF_SIZE + 1);
1166 }
1167
1168 SeqPortFree(spp);
1169 SeqEntryFree(sep);
1170 return fseq;
1171 }
1172
1173 struct {
1174 CharPtr *theMemory;
1175 Int4 count;
1176 } state;
1177
1178 static Boolean
collectTermsProc(CharPtr term,Int4 special,Int4 total)1179 collectTermsProc (CharPtr term, Int4 special, Int4 total)
1180 {
1181 state.theMemory[state.count++] = term;
1182
1183 return TRUE;
1184 }
1185
GetFullEntrezTermList(DocType database,DocField field,Int4Ptr count)1186 NLM_EXTERN CharPtr * LIBCALL GetFullEntrezTermList (DocType database, DocField field, Int4Ptr count)
1187 {
1188 Int4 numTerms;
1189 Int4 numPages;
1190 EntrezInfoPtr info;
1191 Int2 page;
1192
1193 if (!EntrezIsInited ())
1194 return NULL;
1195
1196 info = EntrezGetInfo ();
1197 if (info == NULL || info->type_count <= database)
1198 return NULL;
1199
1200 numTerms = info->types[database].fields[field].num_terms;
1201 numPages = info->types[database].fields[field].num_bucket;
1202 state.theMemory = (CharPtr *) MemNew (sizeof (CharPtr) * numTerms);
1203 state.count = 0;
1204
1205 for (page = 0; page < numPages; page += INT2_MAX / 2) {
1206 EntrezTermListByPage (database, field, page, MIN ((numPages - page), INT2_MAX / 2), collectTermsProc);
1207 }
1208
1209 *count = state.count;
1210 return state.theMemory;
1211 }
1212