1 /*   accutils.c
2  * ===========================================================================
3  *
4  *                            PUBLIC DOMAIN NOTICE
5  *               National Center for Biotechnology Information
6  *
7  *  This software/database is a "United States Government Work" under the
8  *  terms of the United States Copyright Act.  It was written as part of
9  *  the author's official duties as a United States Government employee and
10  *  thus cannot be copyrighted.  This software/database is freely available
11  *  to the public for use. The National Library of Medicine and the U.S.
12  *  Government have not placed any restriction on its use or reproduction.
13  *
14  *  Although all reasonable efforts have been taken to ensure the accuracy
15  *  and reliability of the software and data, the NLM and the U.S.
16  *  Government do not and cannot warrant the performance or results that
17  *  may be obtained by using this software or data. The NLM and the U.S.
18  *  Government disclaim all warranties, express or implied, including
19  *  warranties of performance, merchantability or fitness for any particular
20  *  purpose.
21  *
22  *  Please cite the author in any work or product based on this material.
23  *
24  * ===========================================================================
25  *
26  * RCS $Id: accutils.c,v 6.15 1999/04/01 13:56:34 sicotte Exp $
27  *
28  * Author:  J. Epstein
29  *
30  * Version Creation Date:   10/18/93
31  *
32  * File Description:
33  *       Utilities which make use of the Entrez "data access library"
34  *
35  * Modifications:
36  * --------------------------------------------------------------------------
37  * $Log: accutils.c,v $
38  * Revision 6.15  1999/04/01 13:56:34  sicotte
39  * Moved WHICH_db_accession,IS_ntdb_accession,IS_protdb_accession to
40  *   sequtil.ch
41  * Removed old static functions of Colombe (that are public in salutil.c)
42  * The only code left in accutils.c is the Entrez Access code.
43  *
44  * Revision 6.14  1999/03/31 21:13:47  sicotte
45  * Add info on N-accessions
46  *
47  * Revision 6.13  1999/03/31 13:34:18  sicotte
48  * in WHICH_ntdb_accession, swapped (C** and B** prot accession for embl/ddbj)
49  *
50  * Revision 6.12  1999/03/18 20:24:05  sicotte
51  * changed the define name for NC_ accession
52  *
53  * Revision 6.11  1999/03/18 20:18:18  sicotte
54  * added REFSEQ accession numbers and macro ACCN_IS_GENBANK()
55  *
56  * Revision 6.10  1999/03/18 15:34:04  sicotte
57  * Updated Accession List and added protein Accessions
58  * for functions IS_ntdb_accession and IS_protdb_accession.
59  * New function WHICH_db_accession with return code allowing
60  * to figure out the molecule type and the database from macros
61  * in accutils.h
62  *
63  * Revision 6.9  1999/02/24 16:48:09  kans
64  * added IS_ntdb_accession and IS_protdb_accession, removed NormalizeSeqAlignId
65  *
66  * Revision 6.8  1999/01/27 16:20:51  chappey
67  * update IS_ntdb_accession with AB, AJ
68  *
69  * Revision 6.7  1999/01/06 14:18:36  grisha
70  * add defines to switch ID0/ID1 usage
71  *
72  * Revision 6.6  1998/06/12 19:19:10  kans
73  * fixed unix compiler warnings
74  *
75  * Revision 6.5  1998/04/28 19:29:13  shavirin
76  * Fixed minor purify detected bug.
77  *
78  * Revision 6.4  1998/02/11 19:50:00  kans
79  * FastaSeqPort takes code parameter
80  *
81  * Revision 6.3  1997/11/14 22:13:50  vakatov
82  * [WIN32,DLL]  Added NLM_EXTERN's
83  *
84  * Revision 6.2  1997/09/12 15:28:26  chappey
85  * Revision changes in NormalizeSeqAlign
86  *
87  * Revision 6.1  1997/09/04 14:14:50  chappey
88  * Revision changes in NormalizeSeqAlign
89  *
90  * Revision 5.15  1997/08/07 16:02:40  kans
91  * Revision added NormalizeSeqAlignId (Colombe)
92  *
93  * Revision 5.14  1997/06/26 21:55:17  vakatov
94  * Revision [PC] DLL'd "ncbicdr.lib", "ncbiacc.lib", "ncbinacc.lib" and "ncbicacc.lib"
95  *
96  * Revision 5.13  1997/05/14 14:27:57  shavirin
97  * Revision Function AccessionToFasta adopted for protein accessions
98  *
99  * Revision 5.11  1997/05/13 21:11:05  shavirin
100  * Revision Changed function AccessionToFasta() to use PubMed accession indexes
101  *
102  * Revision 5.10  1997/02/04 18:58:01  epstein
103  * Revision add GetFullEntrezTermList() function
104  *
105  * Revision 5.9  1997/01/13  15:13:34  brandon
106  * changed EntrezStringToField
107  *
108  * Revision 5.8  1997/01/07  17:34:33  epstein
109  * eliminate PMENTREZ by default
110  *
111  * Revision 5.7  1996/10/01  18:19:03  shavirin
112  * Removed unused variable and fixed memory problems in AccessionToFasta()
113  *
114  * Revision 5.6  1996/09/26  17:52:45  ostell
115  * made AccessionToFasta LIBCALL
116  *
117  * Revision 5.5  1996/09/23  21:22:09  shavirin
118  * Added new function AccessionToFasta(), returning Fasta entry for
119  * given Accession or GI as a string
120  *
121  * Revision 5.2  1996/08/14  15:15:05  brandon
122  * added date parameter to tleval functions
123  *
124  * Revision 5.1  1996/07/01  14:06:35  epstein
125  * add 'join function' EntrezCommonHierAncestor()
126  *
127  * Revision 4.6  1996/05/21  17:37:31  epstein
128  * eliminate case-sensitivity
129  *
130  * Revision 4.5  1996/05/14  21:01:40  epstein
131  * remove references to FLD_MLOC
132  *
133  * Revision 4.4  1996/04/23  19:14:46  epstein
134  * memory-leak/access cleanup, per D. Vakatov
135  *
136  * Revision 4.3  1995/10/11  13:40:37  epstein
137  * make EntrezStringToField() and EntrezFieldToString() data-driven
138  *
139  * Revision 4.2  1995/10/02  02:36:00  epstein
140  * add range-checking
141  *
142  * Revision 4.1  1995/08/24  20:44:21  epstein
143  * add more stuff for genomes
144  * ==========================================================================
145  */
146 
147 #define REVISION_STR "$Revision: 6.15 $"
148 
149 #include <accutils.h>
150 #ifndef _CDROMLIB_
151 #include <cdromlib.h>
152 #endif
153 
/*
 * Severity used for lexer/parser diagnostics: SEV_INFO when the argument
 * is NULL, SEV_ERROR otherwise.  The parser passes its term-list pointer.
 */
#define ERRPOST_LVL(x)               ((x) == NULL ? SEV_INFO : SEV_ERROR)

/* Character classes returned by LexClassifyChar() */
enum {
    LEXCHAR_LPAREN     =  1,    /* '('            */
    LEXCHAR_RPAREN     =  2,    /* ')'            */
    LEXCHAR_LBRACKET   =  3,    /* '['            */
    LEXCHAR_RBRACKET   =  4,    /* ']'            */
    LEXCHAR_QUOTE      =  5,    /* '"'            */
    LEXCHAR_AND        =  6,    /* '&'            */
    LEXCHAR_OR         =  7,    /* '|'            */
    LEXCHAR_NOT        =  8,    /* '-'            */
    LEXCHAR_COMMA      =  9,    /* ','            */
    LEXCHAR_ATSIGN     = 10,    /* '@'            */
    LEXCHAR_BACKSLASH  = 11,    /* '\\'           */
    LEXCHAR_WHITESPACE = 12,    /* ' ' or '\t'    */
    LEXCHAR_SEMICOLON  = 13,    /* ';'            */
    LEXCHAR_COLON      = 14,    /* ':'            */
    LEXCHAR_EOL        = 15,    /* '\r' or '\n'   */
    LEXCHAR_NULL       = 16,    /* '\0'           */
    LEXCHAR_OTHER      = 17     /* anything else  */
};

/* States of the lexer's state machine */
enum {
    LEXSTATE_IDLE                = 0,
    LEXSTATE_BACKSLASHED         = 1,
    LEXSTATE_INQUOTE             = 2,
    LEXSTATE_INQUOTE_AFTERBSLASH = 3,
    LEXSTATE_INSTRING            = 4,
    LEXSTATE_ERROR               = 5
};

/* Token codes the lexer stores in the "choice" field of the output node */
enum {
    LEXTOK_LPAREN   =  1,
    LEXTOK_RPAREN   =  2,
    LEXTOK_LBRACKET =  3,
    LEXTOK_RBRACKET =  4,
    LEXTOK_AND      =  5,
    LEXTOK_OR       =  6,
    LEXTOK_NOT      =  7,
    LEXTOK_COMMA    =  8,
    LEXTOK_ATSIGN   =  9,
    LEXTOK_STRING   = 10,
    LEXTOK_RANGE    = 11
};
/* Lexer state; persists between calls to EntrezLexExpression(). */
193 static Int2 lexPosition = 0;    /* index into lexString of the next character to examine */
194 static CharPtr lexString = NULL;    /* private copy (StringSave) of the expression being lexed */
195 static Int2 lexState = LEXSTATE_IDLE;    /* current LEXSTATE_* value */
196 
/* Parser look-ahead; maintained by StrNextNode(). */
197 static ValNodePtr nextnode = NULL;    /* current token: &nextRealNode, or NULL when the lexer returned < 0 */
198 static ValNode nextRealNode;    /* storage that nextnode points at */
199 static Int2 lastGood = -1;    /* value lastBad held before the most recent StrNextNode() */
200 static Int2 lastBad = -1;    /* lexPosition right after the most recent StrNextNode() */
201 
LexClassifyChar(Char c)202 static Int2 LexClassifyChar(Char c)
203 {
204     Int2 retval;
205 
206     switch(c) {
207     case '(':   retval = LEXCHAR_LPAREN; break;
208     case ')':   retval = LEXCHAR_RPAREN; break;
209     case '[':   retval = LEXCHAR_LBRACKET; break;
210     case ']':   retval = LEXCHAR_RBRACKET; break;
211     case '"':   retval = LEXCHAR_QUOTE; break;
212     case '&':   retval = LEXCHAR_AND; break;
213     case '|':   retval = LEXCHAR_OR; break;
214     case '-':   retval = LEXCHAR_NOT; break;
215     case ',':   retval = LEXCHAR_COMMA; break;
216     case '@':   retval = LEXCHAR_ATSIGN; break;
217     case '\\':  retval = LEXCHAR_BACKSLASH; break;
218     case ' ':
219     case '\t':  retval = LEXCHAR_WHITESPACE; break;
220     case ';':   retval = LEXCHAR_SEMICOLON; break;
221     case ':':   retval = LEXCHAR_COLON; break;
222     case '\0':  retval = LEXCHAR_NULL; break;
223     case '\r':
224     case '\n':  retval = LEXCHAR_EOL; break;
225     default:    retval = LEXCHAR_OTHER; break;
226     }
227 
228     return retval;
229 }
230 
231 /* Returns -1 if no token available, else the position of the token */
EntrezLexExpression(CharPtr str,ValNodePtr vnp)232 static Int2 EntrezLexExpression(CharPtr str, ValNodePtr vnp)
233 {
234     Int2 startPos;
235     Int2 classChar;
236     Int2 token = 0;
237     Boolean done;
238     Char c;
239     CharPtr lexToken = NULL;
240     CharPtr lexTokenStart;
241     Int2 len;
242 
243     if (str == NULL && lexString == NULL)
244         return -1;
245     if (str != NULL)
246     {
247         MemFree(lexString);
248         lexString = StringSave(str);
249         lexPosition = 0;
250         lexState = LEXSTATE_IDLE;
251     }
252     if (vnp == NULL)
253         return -1;
254 
255     len = StringLen(lexString);
256     startPos = lexPosition;
257 
258     if (lexPosition >= len)
259     {
260         lexState = LEXSTATE_ERROR;
261         token = -1;
262         lexToken = MemNew(1);
263     } else {
264         lexToken = MemNew(StringLen(&lexString[lexPosition]) + 1);
265     }
266     lexTokenStart = lexToken;
267 
268     for (done = FALSE; ! done && lexPosition <= len; lexPosition++)
269     {
270         c = lexString[lexPosition];
271         classChar = LexClassifyChar(c);
272         switch (lexState) {
273         case LEXSTATE_IDLE:
274             switch (classChar) {
275             case LEXCHAR_LPAREN:
276                 token = LEXTOK_LPAREN; done = TRUE; break;
277             case LEXCHAR_RPAREN:
278                 token = LEXTOK_RPAREN; done = TRUE; break;
279             case LEXCHAR_LBRACKET:
280                 token = LEXTOK_LBRACKET; done = TRUE; break;
281             case LEXCHAR_RBRACKET:
282                 token = LEXTOK_RBRACKET; done = TRUE; break;
283             case LEXCHAR_AND:
284                 token = LEXTOK_AND; done = TRUE; break;
285             case LEXCHAR_OR:
286                 token = LEXTOK_OR; done = TRUE; break;
287             case LEXCHAR_NOT:
288                 token = LEXTOK_NOT; done = TRUE; break;
289             case LEXCHAR_COMMA:
290                 token = LEXTOK_COMMA; done = TRUE; break;
291             case LEXCHAR_ATSIGN:
292                 token = LEXTOK_ATSIGN; done = TRUE; break;
293             case LEXCHAR_COLON:
294                 token = LEXTOK_RANGE; done = TRUE; break;
295             case LEXCHAR_QUOTE:
296                 lexState = LEXSTATE_INQUOTE; break;
297             case LEXCHAR_BACKSLASH:
298                 lexState = LEXSTATE_BACKSLASHED; break;
299             case LEXCHAR_EOL:
300             case LEXCHAR_WHITESPACE:
301                 startPos = lexPosition + 1; break;
302             case LEXCHAR_SEMICOLON:
303             case LEXCHAR_NULL:
304                 lexState = LEXSTATE_ERROR; done = TRUE; break;
305             case LEXCHAR_OTHER:
306             default:
307                 lexState = LEXSTATE_INSTRING; *lexToken++ = c; break;
308             }
309             break;
310         case LEXSTATE_BACKSLASHED:
311             switch (classChar) {
312             case LEXCHAR_NULL:
313             case LEXCHAR_EOL:
314                 *lexToken++ = '\0'; done = TRUE; lexState = LEXSTATE_IDLE; break;
315             case LEXCHAR_LPAREN:
316             case LEXCHAR_RPAREN:
317             case LEXCHAR_LBRACKET:
318             case LEXCHAR_RBRACKET:
319             case LEXCHAR_QUOTE:
320             case LEXCHAR_AND:
321             case LEXCHAR_OR:
322             case LEXCHAR_NOT:
323             case LEXCHAR_COMMA:
324             case LEXCHAR_ATSIGN:
325             case LEXCHAR_BACKSLASH:
326             case LEXCHAR_WHITESPACE:
327             case LEXCHAR_SEMICOLON:
328             case LEXCHAR_COLON:
329             case LEXCHAR_OTHER:
330             default:
331                 lexState = LEXSTATE_INSTRING; *lexToken++ = c; break;
332             }
333             break;
334         case LEXSTATE_INQUOTE:
335             switch (classChar) {
336             case LEXCHAR_QUOTE:
337                 token = LEXTOK_STRING;
338                 *lexToken++ = '\0';
339                 done = TRUE;
340                 lexState = LEXSTATE_IDLE;
341                 break;
342             case LEXCHAR_BACKSLASH:
343                 lexState = LEXSTATE_INQUOTE_AFTERBSLASH; break;
344             case LEXCHAR_NULL:
345             case LEXCHAR_EOL:
346                 lexState = LEXSTATE_ERROR; done = TRUE; break;
347             default:
348                 *lexToken++ = c; break;
349             }
350             break;
351         case LEXSTATE_INQUOTE_AFTERBSLASH:
352             switch (classChar) {
353             case LEXCHAR_NULL:
354             case LEXCHAR_EOL:
355                 lexState = LEXSTATE_ERROR; done = TRUE; break;
356             default:
357                 lexState = LEXSTATE_INQUOTE; *lexToken++ = c; break;
358             }
359             break;
360         case LEXSTATE_INSTRING:
361             switch (classChar) {
362             case LEXCHAR_WHITESPACE:
363             case LEXCHAR_SEMICOLON:
364             case LEXCHAR_NULL:
365             case LEXCHAR_EOL:
366                 token = LEXTOK_STRING;
367                 *lexToken++ = '\0';
368                 done = TRUE;
369                 lexState = LEXSTATE_IDLE;
370                 break;
371             case LEXCHAR_BACKSLASH:
372                 lexState = LEXSTATE_BACKSLASHED;
373                 break;
374             case LEXCHAR_QUOTE:
375                 lexState = LEXSTATE_INQUOTE;
376                 break;
377             case LEXCHAR_OTHER:
378                 *lexToken++ = c; break;
379             default:
380                 token = LEXTOK_STRING;
381                 *lexToken++ = '\0';
382                 done = TRUE;
383                 lexState = LEXSTATE_IDLE;
384                 lexPosition--; /* push back the last character */
385                 break;
386             }
387             break;
388         case LEXSTATE_ERROR:
389             done = TRUE;
390             break;
391         }
392     }
393 
394     vnp->choice = (Uint1) token;
395     vnp->data.ptrvalue = NULL;
396     if (token == LEXTOK_STRING)
397     {
398         vnp->data.ptrvalue = lexTokenStart;
399     } else {
400         MemFree(lexTokenStart);
401     }
402     if (lexState == LEXSTATE_ERROR)
403         return -1;
404     else
405         return startPos;
406 }
407 
StrNextNode(void)408 static void StrNextNode(void)
409 {
410     nextnode = &nextRealNode;
411 
412     if (EntrezLexExpression(NULL, nextnode) < 0)
413     {
414         nextnode = NULL;
415     }
416 
417     lastGood = lastBad;
418     lastBad = lexPosition;
419 }
420 
421 static Boolean StrExpression PROTO((ValNodePtr elst, DocType db, DocField fld));
422 
FindTermRequired(CharPtr term)423 static Boolean FindTermRequired(CharPtr term)
424 {
425     Int2 len;
426 
427     if (term == NULL)
428         return FALSE;
429     len = StrLen(term);
430     return ((len > 3 && term[len-1] == '.' && term[len-2] == '.' &&
431         term[len-3] == '.') || (len > 1 && term[len-1] == '*') ||
432         StrChr(term, '?') != NULL);
433 }
434 
435 /* traverse all the possible fields, and take the "union" of this term */
436 /* over all the possibilities for the specified database               */
AddAllFields(ValNodePtr elst,CharPtr term,DocType db,Boolean special)437 static Boolean AddAllFields(ValNodePtr elst, CharPtr term, DocType db, Boolean special)
438 {
439     EntrezInfoPtr eip;
440     Boolean first = TRUE;
441     Int4 specialCount;
442     Int4 totalCount;
443     EntrezFieldDataPtr fields;
444     DocField fld;
445     Boolean findTermRequired;
446 
447     if ((eip = EntrezGetInfo()) == NULL || elst == NULL)
448         return FALSE;
449     if (db < 0 || db >= eip->type_count)
450         return FALSE;
451     findTermRequired = FindTermRequired(term);
452     fields = eip->types[db].fields;
453     for (fld = 0; fld < eip->field_count; fld++)
454     {
455         if (fields == NULL || fields[fld].num_terms <= 0 || fld ==
456             FLD_ORGN_HIER || (db == TYP_ML && fld == FLD_PROT))
457             continue;
458         if (first)
459         {
460             EntrezTLAddLParen(elst);
461             first = FALSE;
462         } else {
463             EntrezTLAddOR(elst);
464         }
465         if (findTermRequired)
466         {
467             EntrezFindTerm(db, fld, term, &specialCount, &totalCount);
468         }
469         EntrezTLAddTerm(elst, term, db, fld, special);
470     }
471 
472     if (! first)
473     {
474         EntrezTLAddRParen(elst);
475     }
476 
477     return TRUE;
478 }
479 
480 static Boolean
StrFactor(ValNodePtr elst,DocType db,DocField fld)481 StrFactor(ValNodePtr elst, DocType db, DocField fld)
482 {
483     if (nextnode == NULL)
484     {
485         ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "Null factor");
486         return FALSE;
487     }
488     if (nextnode->choice == LEXTOK_LPAREN)
489     {
490         if (elst != NULL)
491             EntrezTLAddLParen (elst);
492         StrNextNode();
493         if (! StrExpression(elst, db, fld))
494         {
495             return FALSE;
496         }
497         if (nextnode != NULL && nextnode->choice == LEXTOK_RPAREN)
498         {
499             if (elst != NULL)
500                 EntrezTLAddRParen (elst);
501             StrNextNode();
502         } else {
503             ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "Missing right paren");
504             return FALSE;
505         }
506     } else {
507         if (nextnode->choice == LEXTOK_STRING)
508         {
509             CharPtr term;
510             Boolean special = FALSE;
511             CharPtr fldStr;
512             CharPtr highRangeStr = NULL;
513             Boolean allFields = FALSE;
514 
515             term = (CharPtr) (nextnode->data.ptrvalue);
516             StrNextNode();
517             if (nextnode != NULL && nextnode->choice == LEXTOK_RANGE)
518             {
519                 StrNextNode();
520                 if (nextnode == NULL || nextnode->choice != LEXTOK_STRING)
521                 {
522                     ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing second half of range");
523                     MemFree(term);
524                     return FALSE;
525                 }
526                 highRangeStr = (CharPtr) (nextnode->data.ptrvalue);
527                 StrNextNode();
528             }
529             if (nextnode != NULL && nextnode->choice == LEXTOK_LBRACKET)
530             {
531                 StrNextNode();
532                 if (nextnode == NULL || nextnode->choice != LEXTOK_STRING)
533                 {
534                     ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing field id after bracket");
535                     MemFree(term);
536                     MemFree(highRangeStr);
537                     return FALSE;
538                 }
539                 fldStr = (CharPtr) (nextnode->data.ptrvalue);
540                 if (fldStr != NULL && StrCmp(fldStr, "*") == 0)
541                 {
542                     allFields = TRUE;
543                 } else {
544                     fld = EntrezStringToField(db, fldStr);
545                 }
546                 MemFree(nextnode->data.ptrvalue);
547                 if (!allFields && fld < 0)
548                 {
549                     ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "bad field identifier");
550                     MemFree(term);
551                     MemFree(highRangeStr);
552                     return FALSE;
553                 }
554                 StrNextNode();
555                 if (nextnode == NULL || (nextnode->choice != LEXTOK_COMMA &&
556                     nextnode->choice != LEXTOK_RBRACKET))
557                 {
558                     ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing right bracket");
559                     MemFree(term);
560                     MemFree(highRangeStr);
561                     return FALSE;
562                 }
563                 if (nextnode->choice == LEXTOK_COMMA)
564                 {
565                     StrNextNode();
566                     if (nextnode == NULL || nextnode->choice != LEXTOK_STRING ||
567                         StringCmp(nextnode->data.ptrvalue, "S") != 0)
568                     {
569                         ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "field qualifier error");
570                         MemFree(term);
571                         MemFree(highRangeStr);
572                         return FALSE;
573                     }
574                     MemFree(nextnode->data.ptrvalue);
575                     special = TRUE;
576                     StrNextNode();
577                     if (nextnode == NULL || nextnode->choice != LEXTOK_RBRACKET)
578                     {
579                         ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "missing right bracket");
580                         MemFree(term);
581                         MemFree(highRangeStr);
582                         return FALSE;
583                     }
584                 }
585                 StrNextNode();
586             }
587 
588             /* if ( the default specified by caller is -1 ==> all ) then */
589             if (fld < 0)
590             {
591                 allFields = TRUE;
592                 if (highRangeStr != NULL)
593                 {
594                     ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "ranges require explicit field specification");
595                     MemFree(term);
596                     MemFree(highRangeStr);
597                     return FALSE;
598                 }
599             }
600 
601             if (elst != NULL)
602             {
603                 if (allFields)
604                 {
605                    AddAllFields(elst, term, db, special);
606                 } else {
607                     Int4 specialCount;
608                     Int4 totalCount;
609 
610                     if (FindTermRequired(term))
611                     {
612                         EntrezFindTerm(db, fld, term, &specialCount, &totalCount);
613                     }
614                     EntrezTLAddTermWithRange(elst, term, db, fld, special, highRangeStr);
615                 }
616             }
617             MemFree (term);
618             MemFree(highRangeStr);
619         } else {
620             ErrPostEx(ERRPOST_LVL(elst), ERR_CD_LEX, 0, "invalid token");
621             return FALSE;
622         }
623     }
624 
625     return TRUE;
626 }
627 
628 
629 static Boolean
StrTerm(ValNodePtr elst,DocType db,DocField fld)630 StrTerm(ValNodePtr elst, DocType db, DocField fld)
631 {
632     if (! StrFactor(elst, db, fld))
633         return FALSE;
634     while (nextnode != NULL && nextnode->choice == LEXTOK_AND)
635     {
636         if (elst != NULL)
637             EntrezTLAddAND(elst);
638         StrNextNode();
639         if (! StrFactor(elst, db, fld))
640             return FALSE;
641     }
642 
643     return TRUE;
644 }
645 
646 static Boolean
StrDiff(ValNodePtr elst,DocType db,DocField fld)647 StrDiff(ValNodePtr elst, DocType db, DocField fld)
648 {
649     if (! StrTerm(elst, db, fld))
650         return FALSE;
651     while (nextnode != NULL && nextnode->choice == LEXTOK_OR)
652     {
653         if (elst != NULL)
654             EntrezTLAddOR(elst);
655         StrNextNode();
656         if (! StrTerm(elst, db, fld))
657             return FALSE;
658     }
659 
660     return TRUE;
661 }
662 
663 static Boolean
StrExpression(ValNodePtr elst,DocType db,DocField fld)664 StrExpression(ValNodePtr elst, DocType db, DocField fld)
665 {
666     if (! StrDiff(elst, db, fld))
667         return FALSE;
668     while (nextnode != NULL && nextnode->choice == LEXTOK_NOT)
669     {
670         if (elst != NULL)
671             EntrezTLAddBUTNOT(elst);
672         StrNextNode();
673         if (! StrDiff(elst, db, fld))
674             return FALSE;
675     }
676 
677     return TRUE;
678 }
679 
EntrezFieldToString(DocType db,DocField fld)680 NLM_EXTERN CharPtr LIBCALL EntrezFieldToString(DocType db, DocField fld)
681 {
682     CharPtr fldStr;
683     static Char str[6];
684 
685     if (EntrezIsInited())
686     {
687         EntrezInfoPtr eip;
688 
689         if ((eip = EntrezGetInfo()) != NULL && fld < eip->field_count)
690         {
691             StrNCpy(str, eip->field_info[fld].tag, sizeof(str) - 1);
692             StringUpper(str);
693             return str;
694         }
695     }
696 
697     switch(fld)
698     {
699     case FLD_WORD:
700         fldStr = "WORD"; break;
701     case FLD_MESH:
702         fldStr = "MESH"; break;
703     case FLD_AUTH:
704         fldStr = "AUTH"; break;
705     case FLD_JOUR:
706         fldStr = "JOUR"; break;
707     case FLD_GENE:
708         fldStr = "GENE"; break;
709     case FLD_KYWD:
710         fldStr = "KYWD"; break;
711     case FLD_ECNO:
712         fldStr = "ECNO"; break;
713     case FLD_ORGN:
714         fldStr = "ORGN"; break;
715     case FLD_ACCN:
716         fldStr = "ACCN"; break;
717     case FLD_PROT:
718         fldStr = "PROT"; break;
719     case FLD_ORGN_HIER:
720         fldStr = "HIER"; break;
721     case FLD_DATE:
722         fldStr = "DATE"; break;
723     case FLD_FKEY:
724         fldStr = "FKEY"; break;
725     case FLD_PROP:
726         fldStr = "PROP"; break;
727     case FLD_SUBS:
728         fldStr = "SUBS"; break;
729     default:
730         fldStr = "????";
731     }
732 
733     return StringSave(fldStr);
734 }
735 
EntrezStringToField(DocType db,CharPtr str)736 NLM_EXTERN DocField LIBCALL EntrezStringToField(DocType db, CharPtr str)
737 {
738     if (str == NULL)
739         return -1;
740 
741 #ifdef _PMENTREZ_
742     return PMEntrezStringToField(db,str);
743 #else
744 
745     if (EntrezIsInited())
746     {
747         EntrezInfoPtr eip;
748         DocField fld;
749 
750         if ((eip = EntrezGetInfo()) != NULL)
751         {
752             for (fld = 0; fld < eip->field_count; fld++)
753             {
754                 if (StringICmp(str, eip->field_info[fld].tag) == 0)
755                     return fld;
756             }
757         }
758 
759 
760 
761     }
762 
763     if (StringICmp(str, "WORD") == 0)
764         return FLD_WORD;
765     if (StringICmp(str, "MESH") == 0)
766         return FLD_MESH;
767     if (StringICmp(str, "AUTH") == 0)
768         return FLD_AUTH;
769     if (StringICmp(str, "JOUR") == 0)
770         return FLD_JOUR;
771     if (StringICmp(str, "GENE") == 0)
772         return FLD_GENE;
773     if (StringICmp(str, "KYWD") == 0)
774         return FLD_KYWD;
775     if (StringICmp(str, "ECNO") == 0)
776         return FLD_ECNO;
777     if (StringICmp(str, "ORGN") == 0)
778         return FLD_ORGN;
779     if (StringICmp(str, "ACCN") == 0)
780         return FLD_ACCN;
781     if (StringICmp(str, "PROT") == 0)
782         return FLD_PROT;
783     if (StringICmp(str, "HIER") == 0)
784         return FLD_ORGN_HIER;
785     if (StringICmp(str, "DATE") == 0)
786         return FLD_DATE;
787     if (StringICmp(str, "FKEY") == 0)
788         return FLD_FKEY;
789     if (StringICmp(str, "PROP") == 0)
790         return FLD_PROP;
791     if (StringICmp(str, "SUBS") == 0)
792         return FLD_SUBS;
793     return -1;
794 #endif
795 }
796 
797 
EntrezPMTLEvalString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)798 NLM_EXTERN LinkSetPtr LIBCALL EntrezPMTLEvalString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
799 {
800     LinkSetPtr lsp;
801     ValNodePtr elst;
802 
803     if (begin != NULL)
804     {
805         *begin = -1;
806     }
807     if (end != NULL)
808     {
809         *end = -1;
810     }
811 
812     if (str == NULL || *str == '\0')
813     {
814         return NULL;
815     }
816 
817     if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
818     {
819         return NULL;
820     }
821 
822     if ((elst = EntrezTLNew(db)) == NULL)
823     {
824         return NULL;
825     }
826 
827     EntrezLexExpression(str, NULL);
828     StrNextNode();
829 
830     lsp = NULL;
831     if (StrExpression(elst, db, fld) && nextnode == NULL)
832     {
833 #ifdef _PMENTREZ_
834       lsp = EntrezPMTLEval(elst,edc);
835 #else
836       lsp = EntrezTLEval(elst);
837 #endif /* _PMENTREZ_ */
838     }
839     EntrezTLFree(elst);
840 
841     if (lastGood < lastBad)
842     {
843         lastGood = lastBad;
844     }
845     if (begin != NULL)
846     {
847         *begin = lastGood;
848     }
849     if (end != NULL)
850     {
851         *end = lastBad;
852     }
853 
854     return lsp;
855 }
856 
EntrezTLEvalString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)857 NLM_EXTERN LinkSetPtr LIBCALL EntrezTLEvalString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
858 {
859   return EntrezPMTLEvalString(str,db,fld,begin,end,NULL);
860 }
861 
EntrezPMTLEvalXString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)862 NLM_EXTERN ByteStorePtr LIBCALL EntrezPMTLEvalXString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
863 {
864     ByteStorePtr bsp;
865     ValNodePtr elst;
866 
867     if (begin != NULL)
868     {
869         *begin = -1;
870     }
871     if (end != NULL)
872     {
873         *end = -1;
874     }
875 
876     if (str == NULL || *str == '\0')
877     {
878         return NULL;
879     }
880 
881     if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
882     {
883         return NULL;
884     }
885 
886     if ((elst = EntrezTLNew(db)) == NULL)
887     {
888         return NULL;
889     }
890 
891     EntrezLexExpression(str, NULL);
892     StrNextNode();
893 
894     bsp = NULL;
895     if (StrExpression(elst, db, fld) && nextnode == NULL)
896     {
897 #ifdef _PMENTREZ_
898       bsp = EntrezPMTLEvalX(elst,edc);
899 #else
900       bsp = EntrezTLEvalX(elst);
901 #endif
902     }
903 
904     EntrezTLFree(elst);
905 
906     if (lastGood < lastBad)
907     {
908         lastGood = lastBad;
909     }
910     if (begin != NULL)
911     {
912         *begin = lastGood;
913     }
914     if (end != NULL)
915     {
916         *end = lastBad;
917     }
918 
919     return bsp;
920 }
921 
EntrezTLEvalXString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)922 NLM_EXTERN ByteStorePtr LIBCALL EntrezTLEvalXString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
923 {
924   return EntrezPMTLEvalXString(str,db,fld,begin,end,NULL);
925 }
926 
EntrezPMTLEvalCountString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)927 NLM_EXTERN Int4 LIBCALL EntrezPMTLEvalCountString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
928 {
929     Int4 count;
930     ValNodePtr elst;
931 
932     if (begin != NULL)
933     {
934         *begin = -1;
935     }
936     if (end != NULL)
937     {
938         *end = -1;
939     }
940 
941     if (str == NULL || *str == '\0')
942     {
943         return 0;
944     }
945 
946     if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
947     {
948         return 0;
949     }
950 
951     if ((elst = EntrezTLNew(db)) == NULL)
952     {
953         return 0;
954     }
955 
956     EntrezLexExpression(str, NULL);
957     StrNextNode();
958 
959     count = 0;
960     if (StrExpression(elst, db, fld) && nextnode == NULL)
961     {
962 #ifdef _PMENTREZ_
963       count = EntrezPMTLEvalCount(elst,edc);
964 #else
965       count = EntrezTLEvalCount(elst);
966 #endif /* _PMENTREZ_ */
967     }
968     EntrezTLFree(elst);
969 
970     if (lastGood < lastBad)
971     {
972         lastGood = lastBad;
973     }
974     if (begin != NULL)
975     {
976         *begin = lastGood;
977     }
978     if (end != NULL)
979     {
980         *end = lastBad;
981     }
982 
983     return count;
984 }
985 
EntrezTLEvalCountString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)986 NLM_EXTERN Int4 LIBCALL EntrezTLEvalCountString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
987 {
988   return EntrezPMTLEvalCountString(str,db,fld,begin,end,NULL);
989 }
990 
991 
EntrezPMTLParseString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end,void * edc)992 NLM_EXTERN Boolean LIBCALL EntrezPMTLParseString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end, void * edc)
993 {
994     Boolean retval;
995 
996     if (begin != NULL)
997     {
998         *begin = -1;
999     }
1000     if (end != NULL)
1001     {
1002         *end = -1;
1003     }
1004 
1005     if (str == NULL || *str == '\0')
1006     {
1007         return FALSE;
1008     }
1009 
1010     if (db != TYP_ML && db != TYP_SEQ && db != TYP_AA && db != TYP_NT && db != TYP_ST && db != TYP_CH)
1011     {
1012         return FALSE;
1013     }
1014     EntrezLexExpression(str, NULL);
1015     StrNextNode();
1016 
1017     retval = StrExpression(NULL, db, fld) && nextnode == NULL;
1018 
1019     if (lastGood < lastBad)
1020     {
1021         lastGood = lastBad;
1022     }
1023     if (begin != NULL)
1024     {
1025         *begin = lastGood;
1026     }
1027     if (end != NULL)
1028     {
1029         *end = lastBad;
1030     }
1031 
1032     return retval;
1033 }
1034 
EntrezTLParseString(CharPtr str,DocType db,DocField fld,Int2Ptr begin,Int2Ptr end)1035 NLM_EXTERN Boolean LIBCALL EntrezTLParseString(CharPtr str, DocType db, DocField fld, Int2Ptr begin, Int2Ptr end)
1036 {
1037   return EntrezPMTLParseString(str,db,fld,begin,end,NULL);
1038 }
1039 
1040 
EntrezCommonHierAncestor(EntrezHierarchyPtr ehp1,EntrezHierarchyPtr ehp2,Int2Ptr distance1,Int2Ptr distance2)1041 NLM_EXTERN CharPtr LIBCALL EntrezCommonHierAncestor(EntrezHierarchyPtr ehp1, EntrezHierarchyPtr ehp2, Int2Ptr distance1, Int2Ptr distance2)
1042 {
1043     Int2 count;
1044     CharPtr retval;
1045 
1046     if (ehp1 == NULL || ehp2 == NULL || ehp1->lineage == NULL || ehp2->lineage == NULL)
1047         return NULL;
1048 
1049     for (count = 0; count < ehp1->numInLineage && count < ehp2->numInLineage;
1050          count++) {
1051         if (StrCmp(ehp1->lineage[count], ehp2->lineage[count]) != 0)
1052             break;
1053     }
1054 
1055     if (count <= 0)
1056         return NULL;
1057     count--;
1058     retval = StringSave(ehp1->lineage[count]);
1059 
1060     if (distance1 != NULL)
1061         *distance1 = ehp1->numInLineage - count;
1062     if (distance2 != NULL)
1063         *distance2 = ehp2->numInLineage - count;
1064 
1065     return retval;
1066 }
1067 
1068 /*****************************************************************************
1069 *
1070 *  Function:	AccessionToFasta
1071 *
1072 *  Description: Returns Fasta entry for given Accession or GI number
1073 *
1074 *****************************************************************************/
1075 #define ATF_INIT_BUFF_SIZE 256
1076 
AccessionToFasta(CharPtr string)1077 NLM_EXTERN FastaSeqPtr LIBCALL AccessionToFasta(CharPtr string)
1078 {
1079     Int4 gi=0;
1080     SeqEntryPtr sep;
1081     BioseqPtr bsp = NULL;
1082     SeqPortPtr spp;
1083     FastaSeqPtr fseq;
1084     Char buff[512];
1085     Int4 SequenceLen = 0;
1086     Uint1 code;
1087     Boolean is_na;
1088 
1089     CharPtr str;
1090     ByteStorePtr bstore;
1091     Int4 GiNum;
1092 
1093     if(string == NULL)
1094         return NULL;
1095 
1096     if((fseq = MemNew(sizeof(FastaSeq))) == NULL)
1097         return NULL;
1098 
1099     fseq->label = NULL;
1100     fseq->seq = NULL;
1101 
1102     if((gi = atol(string)) > 0) {
1103         fseq->gi = gi;
1104     } else {
1105 
1106         str = (CharPtr) MemNew(StringLen(string)+30);
1107         sprintf(str, "\"%s\"[ACCN]", string);
1108 
1109         if((bstore = EntrezTLEvalXString(str, TYP_NT,
1110                                          -1, NULL, NULL)) == NULL ||
1111            (GiNum = BSLen(bstore)/sizeof(DocUid)) != 1) {
1112 
1113             /* Try protein accessions */
1114 
1115             if((bstore = EntrezTLEvalXString(str, TYP_AA,
1116                                              -1, NULL, NULL)) == NULL ||
1117                (GiNum = BSLen(bstore)/sizeof(DocUid)) != 1) {
1118 
1119                 MemFree(fseq);
1120                 MemFree(str);
1121                 return NULL;
1122             }
1123         }
1124 
1125         MemFree(str);
1126 
1127         BSSeek(bstore, 0L, 0);
1128 
1129         BSRead(bstore, &gi, sizeof(Int4));
1130         BSFree(bstore);
1131 
1132         fseq->gi = gi;
1133     }
1134 
1135     /* Now fetching sequence and defline from Entrez */
1136 
1137     if((sep = EntrezSeqEntryGet(gi, 1)) == NULL) {
1138         return NULL;
1139     }
1140 
1141     if((bsp = find_big_bioseq(sep)) == NULL) {
1142       SeqEntryFree(sep);
1143       return NULL;
1144     }
1145 
1146     StringCpy(buff, ">");
1147     SeqIdWrite(bsp->id, buff+1, PRINTID_FASTA_LONG, sizeof(buff));
1148     StringCat(buff, " ");
1149     CreateDefLine(NULL, bsp, buff+StringLen(buff),
1150                   sizeof(buff)-StringLen(buff),
1151                   0, NULL, NULL);
1152 
1153     fseq->label = StringSave(buff);
1154     is_na = ISA_na (bsp->mol);
1155     if (is_na)
1156         code = Seq_code_iupacna;
1157     else
1158         code = Seq_code_ncbieaa;
1159     spp = FastaSeqPort(bsp, is_na, FALSE, code);
1160     fseq->seq = MemNew(ATF_INIT_BUFF_SIZE+1);
1161 
1162     while (FastaSeqLine(spp, fseq->seq+SequenceLen,
1163                         ATF_INIT_BUFF_SIZE, TRUE)) {
1164         SequenceLen += ATF_INIT_BUFF_SIZE;
1165         fseq->seq = Realloc(fseq->seq, SequenceLen + ATF_INIT_BUFF_SIZE + 1);
1166     }
1167 
1168     SeqPortFree(spp);
1169     SeqEntryFree(sep);
1170     return fseq;
1171 }
1172 
1173 struct {
1174   CharPtr        *theMemory;
1175                   Int4 count;
1176 }               state;
1177 
1178 static          Boolean
collectTermsProc(CharPtr term,Int4 special,Int4 total)1179                 collectTermsProc (CharPtr term, Int4 special, Int4 total)
1180 {
1181   state.theMemory[state.count++] = term;
1182 
1183   return TRUE;
1184 }
1185 
GetFullEntrezTermList(DocType database,DocField field,Int4Ptr count)1186 NLM_EXTERN CharPtr        * LIBCALL GetFullEntrezTermList (DocType database, DocField field, Int4Ptr count)
1187 {
1188   Int4            numTerms;
1189   Int4            numPages;
1190   EntrezInfoPtr   info;
1191   Int2 page;
1192 
1193   if (!EntrezIsInited ())
1194     return NULL;
1195 
1196   info = EntrezGetInfo ();
1197   if (info == NULL || info->type_count <= database)
1198     return NULL;
1199 
1200   numTerms = info->types[database].fields[field].num_terms;
1201   numPages = info->types[database].fields[field].num_bucket;
1202   state.theMemory = (CharPtr *) MemNew (sizeof (CharPtr) * numTerms);
1203   state.count = 0;
1204 
1205   for (page = 0; page < numPages; page += INT2_MAX / 2) {
1206     EntrezTermListByPage (database, field, page, MIN ((numPages - page), INT2_MAX / 2), collectTermsProc);
1207   }
1208 
1209   *count = state.count;
1210   return state.theMemory;
1211 }
1212