1 /* $Id: qbatch.c,v 6.12 2005/10/13 13:53:51 kans Exp $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  $RCSfile: qbatch.c,v $
27 *
28 * Author:  Sergei Shavirin
29 *
30 * Version Creation Date: 05/04/2000
31 *
32 * $Revision: 6.12 $
33 *
34 * File Description:
*         WWW and Command-line Batch Entrez using Entrez2 and ID1
36 *
37 * $Log: qbatch.c,v $
38 * Revision 6.12  2005/10/13 13:53:51  kans
39 * commented out depracated EntrezSetServer function call
40 *
41 * Revision 6.11  2004/08/13 02:01:21  beloslyu
42 * Changes for FreeBSD
43 *
44 * Revision 6.10  2002/10/24 17:37:36  ucko
45 * Kludge around Darwin's lack of ctime_r.
46 *
47 * Revision 6.9  2001/03/19 16:18:17  beloslyu
48 * fix the args for ctime_r for OSF1 on alpha
49 *
50 * Revision 6.8  2000/08/25 21:02:15  shavirin
51 * Changed Flat file printing from DUMP_MODE to RELEASE_MODE.
52 *
53 * Revision 6.7  2000/06/21 14:34:35  beloslyu
54 * fix the args for ctime_r on linux
55 *
56 * Revision 6.6  2000/05/31 12:37:17  kans
57 * removed SwapUint4, which somehow got back into the code
58 *
59 * Revision 6.5  2000/05/30 16:20:28  kans
60 * removed cvs merge when incorrect endian code was removed
61 *
62 * Revision 6.3  2000/05/26 18:05:53  shavirin
63 * Added protection against big-little endians for uids from Entrez.
64 *
65 * Revision 6.2  2000/05/09 13:37:01  shavirin
66 * Use functions IS_ntdb_accession() and IS_protdb_accession() to
67 * verify accession number.
68 *
69 * Revision 6.1  2000/05/04 21:14:33  shavirin
70 * Initial revision.
71 *
72 *
73 * ==========================================================================
74 */
75 
76 #include <ncbi.h>
77 #include <sequtil.h>
78 #include <asn2ff.h>
79 #include <tofasta.h>
80 #include <ffprint.h>
81 #include <ent2api.h>
82 #include <accid1.h>
83 
/* Database name strings */
#define DB_NUCLEOTIDE   "Nucleotide"
#define DB_PROTEIN      "Protein"
#define DB_MEDLINE      "Medline"

/* Number of entries in the BE_args[] command-line table */
#define NUMARGS         (sizeof(BE_args)/sizeof(BE_args[0]))
89 
90 Args BE_args[] = {
91   {"Database\n"
92    "         0 - Nucleotide  \n"
93    "         1 - Protein",
94    "0", NULL, NULL, FALSE,'s',ARG_INT,0.0,0,NULL},
95   {"Format of output\n"
96    "         0 - GenBank/GenPept   \n"
97    "         1 - FASTA \n"
98    "         2 - ASN.1 \n"
99    "         3 - List of GIs\n"
100    "         4 - List of Deflines\n"
101    "         5 - Single GenBank Bioseq-set ASN.1",
102    "0", NULL,NULL,FALSE,'f',ARG_INT, 0.0,0,NULL},
103   {"Number of returned entries from single gi/accession\n"
104    "         0 - Return single entry specified by gi/accession  \n"
105    "         1 - Return all entries in SeqEntry, that available",
106    "0", NULL,NULL,FALSE,'n',ARG_INT,0.0,0,NULL},
107   {"Text or HTML? (For GenBank/GenPept)\n"
108    "         0 - Text output   \n"
109    "         1 - HTML output",
110    "0", NULL,NULL,FALSE,'h',ARG_INT, 0.0,0,NULL},
111   {"Query string",
112    NULL, NULL,NULL,TRUE,'q',ARG_STRING, 0.0,0,NULL},
113   {"ID (accession or gi)",
114    NULL, NULL,NULL,TRUE,'u',ARG_STRING, 0.0,0,NULL},
115   {"File with list of GIS/Accessions",
116    "stdin", NULL,NULL,TRUE,'i',ARG_FILE_IN, 0.0,0,NULL},
117   {"Logfile name:",
118    "stdout", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL}
119 };
120 
121 #define F_GEN       0
122 #define F_FASTA     1
123 #define F_ASN1      2
124 #define F_GILIST    3
125 #define F_DLIST     4
126 #define F_ASN1_GENB 5
127 
128 #define REQ_DEFAULT        0
129 #define REQ_LIST_OF_GIS    1
130 #define REQ_ADVANCED_QUERY 2
131 #define REQ_ORGANISM       3
132 
133 #define SQL_SRV_NAME "PUBSEQ_OS"
134 
135 #define QSRV_LOGFILE_NAME     "qserver.log"
136 #define QSRV_LOGFILE_STD_NAME "/tmp/qserver_std.log"
137 
138 typedef VoidPtr QSRVHandlePtr;
139 
140 static CharPtr BE_Dbname[] = {"Nucleotide", "Protein", "Medline"};
141 
142 typedef struct BEData {
143     Int4     database;
144     CharPtr  query;
145     CharPtr  uids;
146     Int4     format;
147     Int4     allset;
148     Int4     request_type;
149     Boolean  html;
150     Int4     savetodisk;
151     Int4     maxdocs;
152     Int4     noheader;
153     Int4     commandline;
154     Int4     count;
155 } BEData, PNTR BEDataPtr;
156 
157 typedef struct BGenBank {
158     AsnIoPtr   aip;
159     AsnTypePtr atp;
160     AsnTypePtr atp_bioseq_set_seq_set;
161     AsnTypePtr atp_bioseq_set;
162 } BGenBank, PNTR BGenBankPtr;
163 
/* Look up the ASN.1 type called `name` in the module list `amp` and store
   the result in `atp`.  A variable named `amp` must be in scope where the
   macro is used.  When the type is not found an error is posted and the
   ENCLOSING function returns NULL.  The body is wrapped in
   do { } while (0) so that the macro behaves as one statement, e.g. under
   an unbraced if/else. */
#define MACRO_atp_find(atp,name)\
        do {\
            if(((atp) = AsnTypeFind(amp, #name))==NULL){\
                ErrPostEx(SEV_ERROR,0,0,\
                        "Could not find type <%s>", #name);\
                return NULL; \
            }\
        } while (0)
170 
BESeqEntryGet(Int4 gi)171 SeqEntryPtr BESeqEntryGet(Int4 gi)
172 {
173     SeqEntryPtr sep = NULL;
174 
175     if((sep = ID1SeqEntryGet(gi, 0)) == NULL) {
176         fprintf(stderr, "Sequence ID %d cannot be retrieved "
177                 "from the database", gi);
178     }
179 
180     return sep;
181 }
182 
BGenBankInit(void)183 BGenBankPtr BGenBankInit(void)
184 {
185     BGenBankPtr bgbp;
186     AsnModulePtr amp;
187 
188     AsnTypePtr       atp_bioseq_set;
189     AsnTypePtr       atp_bioseq_set_level;
190     AsnTypePtr       atp_bioseq_set_class;
191     AsnTypePtr       atp_bioseq_set_release;
192     AsnTypePtr       atp_bioseq_set_date;
193     AsnTypePtr       atp_bioseq_set_seq_set;
194     AsnTypePtr       atp_bioseq_set_seq_set_E;
195 
196     Char  release[] = "Q-server Production";
197     Char  date_time[128];
198     NCBI_Date       date;
199     DataVal dv;
200 
201     DateClean(&date);
202 
203     bgbp = (BGenBankPtr)MemNew(sizeof(BGenBank));
204     bgbp->aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
205 
206     amp = AsnAllModPtr();
207 
208     MACRO_atp_find(atp_bioseq_set,Bioseq-set);
209     MACRO_atp_find(atp_bioseq_set_level,Bioseq-set.level);
210     MACRO_atp_find(atp_bioseq_set_class,Bioseq-set.class);
211     MACRO_atp_find(atp_bioseq_set_release,Bioseq-set.release);
212     MACRO_atp_find(atp_bioseq_set_date,Bioseq-set.date);
213     MACRO_atp_find(atp_bioseq_set_seq_set,Bioseq-set.seq-set);
214     MACRO_atp_find(atp_bioseq_set_seq_set_E,Bioseq-set.seq-set.E);
215 
216     if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set,NULL))
217         return NULL;
218     dv.intvalue = 0;
219 
220     if(!AsnWrite(bgbp->aip,atp_bioseq_set_level,&dv))
221         return NULL;
222     dv.intvalue = 7;
223 
224     if(!AsnWrite(bgbp->aip,atp_bioseq_set_class,&dv))
225         return NULL;
226     dv.ptrvalue = &release;
227 
228     if(!AsnWrite(bgbp->aip,atp_bioseq_set_release,&dv))
229         return NULL;
230 
231     Nlm_DayTimeStr(date_time,TRUE,TRUE);
232     date.str=date_time;
233 
234     if(!DateAsnWrite(&date,bgbp->aip,atp_bioseq_set_date))
235         return NULL;
236 
237     if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set_seq_set,NULL))
238         return NULL;
239     AsnIoFlush(bgbp->aip);
240 
241     bgbp->atp =atp_bioseq_set_seq_set_E;
242     bgbp->atp_bioseq_set_seq_set = atp_bioseq_set_seq_set;
243     bgbp->atp_bioseq_set = atp_bioseq_set;
244 
245     return bgbp;
246 }
/* Finish the Bioseq-set started by BGenBankInit(): close the seq-set and
   the Bioseq-set structures, close the ASN.1 stream and free the handle.
   A NULL handle (BGenBankInit() failed) is ignored. */
void BGenBankClose(BGenBankPtr bgbp)
{
    if(bgbp == NULL)
        return;

    AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set_seq_set, NULL);
    AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set, NULL);

    AsnIoClose(bgbp->aip);
    MemFree(bgbp);
}
256 
/* Print the `count` entries whose gis are listed in `ids` to stdout, in
   the format selected by pBdata->format.

   pBdata  request parameters; `format`, `database` and `allset` are read
   ids     array of gis
   count   number of elements in `ids`

   F_GILIST only prints the numbers.  For every other format each entry is
   retrieved with BESeqEntryGet(); entries that cannot be retrieved are
   reported with ErrPostEx() and skipped. */
void BEPrintIds(BEDataPtr pBdata, Uint4 *ids, int count)
{
    Int4        i;
    SeqEntryPtr sep, sep_all;
    SeqEntryPtr wrapper;
    Boolean     retvalue = TRUE;
    SeqIdPtr    sip;
    BioseqPtr   bsp;
    BGenBankPtr bgbp = NULL;
    AsnIoPtr    aip;
    Boolean     is_na;

    if(pBdata->format == F_GILIST) {
        for(i = 0; i < count; i++)
            fprintf(stdout, "%d\n", (int) ids[i]);
        return;
    }

    is_na = (Boolean)(pBdata->database == 0);

    if(pBdata->format == F_ASN1_GENB) {
        /* BGenBankInit() has already posted the reason on failure. */
        if((bgbp = BGenBankInit()) == NULL)
            return;
    }

    for(i = 0; i < count; i++) {

        sip = NULL;
        wrapper = NULL;

        sep_all = BESeqEntryGet(ids[i]);

        if(sep_all == NULL) {
            ErrPostEx(SEV_ERROR, 88, 67, "Retrieving of blob for the "
                      "gi=%d failed", (int)ids[i]);
            continue;
        }

        if(!pBdata->allset) {
            /* Only the Bioseq with this gi is wanted: locate it inside
               the retrieved entry and wrap it in a temporary SeqEntry. */
            ObjMgrRegister(OBJ_SEQENTRY, sep_all);

            if((sip = ValNodeNew(NULL)) == NULL) {
                ErrPostEx(SEV_ERROR, 88, 67,
                          "Error finding bioseq for gi=%d\n", (int)ids[i]);
                SeqEntryFree(sep_all);
                continue;
            }
            sip->choice = SEQID_GI;
            sip->data.intvalue = ids[i];

            if((bsp = BioseqFind(sip)) == NULL) {
                ErrPostEx(SEV_ERROR, 88, 67,
                          "Error finding bioseq for gi=%d\n", (int)ids[i]);
                ValNodeFree(sip);
                SeqEntryFree(sep_all);
                continue;
            }

            wrapper = SeqEntryNew();
            wrapper->choice = 1; /* Bioseq */
            wrapper->data.ptrvalue = bsp;
            sep = wrapper;
        } else {
            sep = sep_all;
        }

        switch(pBdata->format) {
        case F_FASTA:               /* 1 */

            /* Retry with the opposite molecule type before giving up. */
            if(!SeqEntryToFasta(sep, stdout, is_na)) {
                if(!SeqEntryToFasta(sep, stdout, !is_na)) {
                    ErrPostEx(SEV_ERROR, 88, 67, "Printing of FASTA format "
                              "(gi=%d) failed\r\n", (int)ids[i]);
                }
            }

            break;
        case F_ASN1:                /* 2 */

            aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
            SeqEntryAsnWrite(sep, aip, NULL);
            AsnIoClose(aip);

            break;
        case F_GILIST:              /* 3 */
            /* Never reached: F_GILIST returns before the loop. */
            break;
        case F_DLIST:               /* 4 */
            retvalue = SeqEntrysToDefline(sep, stdout, is_na,
                                          IS_Bioseq(sep) ? 3 : 0);

            if(retvalue == FALSE) {
                retvalue = SeqEntrysToDefline(sep, stdout, !is_na,
                                              IS_Bioseq(sep) ? 3 : 0);
            }
            break;
        case F_ASN1_GENB:           /* 5 */
            retvalue = SeqEntryAsnWrite(sep, bgbp->aip, bgbp->atp);
            break;
        default:
        case F_GEN:                 /* 0 */
            /* The flat file is always produced from the whole entry;
               `sip` (NULL when allset is on) is passed along with it.
               The second attempt uses the opposite format; its result
               is ignored. */
            if(!SeqEntryToFlatEx(sep_all, stdout,
                                 is_na ? GENBANK_FMT : GENPEPT_FMT,
                                 RELEASE_MODE, sip, FF_REGULAR)) {

                (void)SeqEntryToFlatEx(sep_all, stdout,
                                       is_na ? GENPEPT_FMT : GENBANK_FMT,
                                       RELEASE_MODE, sip, FF_REGULAR);
            }
            break;
        }

        if(wrapper != NULL) {
            /* The wrapped Bioseq still belongs to sep_all: detach it and
               release only the wrapper node. */
            wrapper->data.ptrvalue = NULL;
            ValNodeFree(wrapper);
        }

        SeqEntryFree(sep_all);
        ValNodeFree(sip);
    }

    if(pBdata->format == F_ASN1_GENB)
        BGenBankClose(bgbp);

    return;
}
/* Free a request parameter block together with the `query` and `uids`
   strings it holds.  A NULL pointer is ignored. */
void BEFreeCLParam(BEDataPtr pBdata)
{
    if(pBdata == NULL)
        return;

    MemFree(pBdata->query);
    MemFree(pBdata->uids);
    MemFree(pBdata);
}
BEMakeCLParam(void)378 BEDataPtr BEMakeCLParam(void)
379 {
380     BEDataPtr pBdata;
381     FILE *fd;
382 
383     if (!GetArgs ("qserver", NUMARGS, BE_args))
384         return NULL;
385 
386     if(!ErrSetLogfile (BE_args[7].strvalue, ELOG_APPEND))
387         exit(1);
388 
389     pBdata = (BEDataPtr)MemNew(sizeof(BEData));
390 
391     pBdata->database      = BE_args[0].intvalue;
392     pBdata->format        = BE_args[1].intvalue;
393     pBdata->allset        = BE_args[2].intvalue;
394     pBdata->html          = (Uchar) BE_args[3].intvalue;
395     pBdata->query         = StringSave(BE_args[4].strvalue);
396 
397     if(BE_args[5].strvalue != NULL)
398         pBdata->uids = StringSave(BE_args[5].strvalue);
399     else if(BE_args[4].strvalue == NULL) {
400         fd = FileOpen(BE_args[6].strvalue, "r");
401         pBdata->uids = WWWReadFileInMemory(fd, 0, FALSE);
402         FileClose(fd);
403     }
404 
405     pBdata->commandline   = TRUE;
406 
407     if(pBdata->query == NULL && pBdata->uids == NULL) {
408         MemFree(pBdata);
409         ErrPostEx(SEV_ERROR, 88, 0,
410                   "Error in reading parameters. "
411                   "Please check, that query string was set\n");
412         return NULL;
413     }
414 
415     return pBdata;
416 }
/* Replace every '\n' and '\r' in `query` with a space, in place.
   A NULL pointer is ignored. */
static void CleanCRLF(CharPtr query)
{
    CharPtr cursor = query;

    if(cursor == NULL)
        return;

    while(*cursor != NULLB) {
        switch(*cursor) {
        case '\n':
        case '\r':
            *cursor = ' ';
            break;
        default:
            break;
        }
        cursor++;
    }
}
430 
BEMakeWWWParam(WWWInfoPtr info)431 BEDataPtr BEMakeWWWParam(WWWInfoPtr info)
432 {
433     BEDataPtr pBdata;
434     CharPtr   chptr;
435     Char tmp[512];
436 
437     pBdata = (BEDataPtr)MemNew(sizeof(BEData));
438 
439     /* Database to search */
440 
441     if((chptr = WWWGetValueByName(info, "DATABASE")) == NULL) {
442         if((chptr = WWWGetValueByName(info, "DB")) == NULL)
443             if((chptr = WWWGetValueByName(info, "DATALIB")) == NULL) {
444                 chptr = "n"; /* Default to nucleotides */
445             }
446     }
447 
448     switch(*chptr) {
449     case 'n':
450         pBdata->database = 0;
451         break;
452     case 'p':
453         pBdata->database = 1;
454         break;
455     default:
456         pBdata->database = 0;
457         break;
458     }
459 
460     if((chptr = WWWGetValueByName(info, "REQUEST_TYPE")) != NULL) {
461         if(!StringICmp(chptr, "LIST_OF_GIS"))
462             pBdata->request_type = REQ_LIST_OF_GIS;
463         else if(!StringICmp(chptr, "ADVANCED_QUERY"))
464             pBdata->request_type = REQ_ADVANCED_QUERY;
465         else if(!StringICmp(chptr, "ORGANISM"))
466             pBdata->request_type = REQ_ORGANISM;
467     } else {
468         pBdata->request_type = REQ_DEFAULT;
469     }
470 
471     switch(pBdata->request_type) {
472     case REQ_ADVANCED_QUERY:
473     case REQ_LIST_OF_GIS:
474     case REQ_DEFAULT:
475         /* Query string */
476 
477         if((chptr = WWWGetValueByName(info, "TERM")) == NULL)
478             chptr = WWWGetValueByName(info, "QUERY");
479 
480         if(chptr != NULL && *chptr != NULLB) {
481             pBdata->query = StringSave(chptr);
482             CleanCRLF(pBdata->query);
483         }
484 
485         /* List of UIDs */
486 
487         if((chptr = WWWGetValueByName(info, "UID")) != NULL &&
488            *chptr != NULLB) {
489             pBdata->uids = StringSave(chptr);
490         }
491         break;
492     case REQ_ORGANISM:
493         /* Query for organism retrieval */
494 
495         if(((chptr = WWWGetValueByName(info, "ORGNAME")) != NULL &&
496             *chptr != NULLB) ||
497            ((chptr = WWWGetValueByName(info, "LIST_ORG")) != NULL &&
498             *chptr != NULLB && StringICmp (chptr, "(None)"))) {
499             sprintf(tmp, "%s[ORGN]", chptr);
500             pBdata->query = StringSave(tmp);
501         }
502         break;
503     }
504 
505     /* Checking for data consistency */
506 
507     if(pBdata->request_type == REQ_LIST_OF_GIS && pBdata->uids == NULL)
508         goto fail_return;
509     if((pBdata->request_type == REQ_ADVANCED_QUERY ||
510         pBdata->request_type == REQ_ORGANISM) && pBdata->query == NULL)
511         goto fail_return;
512     if(pBdata->query == NULL && pBdata->uids == NULL)
513         goto fail_return;
514 
515     /* Format of output */
516 
517     if((chptr = WWWGetValueByName(info, "FORMAT")) != NULL ||
518        (chptr = WWWGetValueByName(info, "DOPT")) != NULL) {
519         pBdata->format = atoi(chptr);
520     } else {
521         pBdata->format = F_GILIST;
522     }
523 
524     /* HTML output */
525 
526     if((chptr = WWWGetValueByName(info, "HTML")) != NULL) {
527         if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
528            !StringICmp(chptr, "0"))
529             pBdata->html = FALSE;
530         else
531             pBdata->html = TRUE;
532     }
533 
534     /* Output type */
535 
536     if((chptr = WWWGetValueByName(info, "SAVETO")) != NULL) {
537         if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
538            !StringICmp(chptr, "0"))
539             pBdata->savetodisk = FALSE;
540         else
541             pBdata->savetodisk = TRUE;
542     }
543 
544 
545     if((chptr = WWWGetValueByName(info, "ALLSET")) != NULL) {
546         if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
547            !StringICmp(chptr, "0"))
548             pBdata->allset = FALSE;
549         else
550             pBdata->allset = TRUE;
551     }
552 
553     if((chptr = WWWGetValueByName(info, "MAXDOCS")) != NULL ||
554        (chptr = WWWGetValueByName(info, "DISPMAX")) != NULL)
555         pBdata->maxdocs = atol(chptr);
556 
557     if((chptr = WWWGetValueByName(info, "NOHEADER")) != NULL) {
558         if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
559            !StringICmp(chptr, "0"))
560             pBdata->noheader = FALSE;
561         else
562             pBdata->noheader = TRUE;
563     }
564 
565     return pBdata;
566 
567  fail_return:
568     MemFree(pBdata);
569     return NULL;
570 }
571 
572 /* This function is interface to the Entrez2 engine. It may be used
573    to get list of gis corresponding to the Entrez Boolean string or
574    just number of such hits in the Entrez database */
575 
BEGetUidsFromQuery(CharPtr query,Uint4Ptr PNTR uids,Boolean is_na,Boolean count_only)576 static Int4 BEGetUidsFromQuery(CharPtr query, Uint4Ptr PNTR uids,
577                                Boolean is_na, Boolean count_only)
578 {
579     Entrez2ReplyPtr e2ry;
580     Entrez2RequestPtr  e2rq;
581     E2ReplyPtr e2rp;
582     Int4 count = 0, i;
583     Entrez2BooleanReplyPtr e2br;
584     Entrez2IdListPtr e2idlist;
585 
586     *uids = NULL;
587 
588     EntrezSetProgramName ("BLAST API");
589     /* EntrezSetServer ("www.ncbi.nlm.nih.gov", 80,
590                      "/entrez/utils/entrez2server.fcgi"); */
591 
592     e2rq = EntrezCreateBooleanRequest (!count_only, FALSE,
593                                        is_na? "Nucleotide" : "Protein",
594                                        query, 0, 0, NULL, 0, 0);
595 
596     e2ry = EntrezSynchronousQuery (e2rq);
597 
598     if (e2ry == NULL) {
599         ErrPostEx(SEV_ERROR, 0, 0,
600                   "NULL returned from EntrezSynchronousQuery()");
601         return -1;
602     }
603 
604     if((e2rp = e2ry->reply) == NULL) {
605         ErrPostEx(SEV_ERROR, 0, 0, "Invalid ASN.1: E2ReplyPtr==NULL");
606         return -1;
607     }
608 
609     switch(e2rp->choice) {
610 
611     case E2Reply_error:
612         ErrPostEx(SEV_ERROR, 0, 0, (CharPtr) e2rp->data.ptrvalue);
613         count = -1;
614         break;
615     case E2Reply_eval_boolean:
616         e2br = (Entrez2BooleanReplyPtr) e2rp->data.ptrvalue;
617         count = e2br->count;
618         if((e2idlist = e2br->uids) != NULL) {
619             count = e2idlist->num;
620             *uids = MemNew(sizeof(Int4)*count);
621             BSSeek((ByteStorePtr) e2idlist->uids, 0, SEEK_SET);
622             BSRead((ByteStorePtr) e2idlist->uids, *uids, sizeof(Int4)*count);
623 
624         }
625         break;
626     default:
627         ErrPostEx(SEV_ERROR, 0, 0, "Invalid reply type from the server: %d", e2rp->choice);
628         count = -1;
629         break;
630 
631     }
632 
633     Entrez2ReplyFree(e2ry);
634     Entrez2RequestFree(e2rq);
635 
636     return count;
637 }
BEAccessionToGi(CharPtr string)638 static Int4 BEAccessionToGi (CharPtr string)
639 {
640     Char buffer[32];
641     CharPtr chptr;
642     Int2 version;
643     Int4 gi, index;
644     SeqIdPtr sip;
645     TextSeqIdPtr tsip;
646     PDBSeqIdPtr  psip;
647     long tmplong;
648     Boolean digit;
649 
650     for(chptr = string, digit = TRUE; *chptr != NULLB; chptr++) {
651         if(!IS_DIGIT(*chptr)) {
652             digit = FALSE;
653             break;
654         }
655     }
656 
657     if(digit) {
658         if((gi = atol(string)) > 0)
659             return gi;
660     }
661 
662     /* all letters in accesion should be upper */
663     string = Nlm_StringUpper(string);
664 
665     gi = 0;
666 
667     if((sip = ValNodeNew (NULL)) == NULL)
668         return -1;
669 
670     index = 0; version = 0;
671     while (*string != '\0' && index < 16) {
672         if (*string == '.')
673             break;
674         buffer[index] = *string;
675         string++;
676         index++;
677     }
678 
679     buffer[index] = '\0';
680     if (*string == '.' && *(string+1) != '\0') {
681         sscanf((string+1), "%ld", &tmplong);
682         version = (Int2) tmplong;
683     }
684 
685     if((tsip = TextSeqIdNew ()) == NULL)
686         return -1;
687 
688     tsip->accession = StringSave(buffer);
689     tsip->version = version;
690 
691     /* GenBank, EMBL, and DDBJ. */
692     sip->choice = SEQID_GENBANK;
693     sip->data.ptrvalue = (Pointer) tsip;
694     gi = ID1FindSeqId (sip);
695 
696     if (gi == 0) {
697         /* SwissProt. */
698         sip->choice = SEQID_SWISSPROT;
699         gi = ID1FindSeqId (sip);
700     } else {
701         goto retpoint;
702     }
703 
704     if (gi == 0) {
705         /* PIR */
706         sip->choice = SEQID_PIR;
707         gi = ID1FindSeqId (sip);
708     } else {
709         goto retpoint;
710     }
711 
712     if (gi == 0) {
713         /* PRF */
714         sip->choice = SEQID_PRF;
715         gi = ID1FindSeqId (sip);
716     } else {
717         goto retpoint;
718     }
719 
720     if (gi == 0) {
721         /* OTHER, probably 'ref' */
722         sip->choice = SEQID_OTHER;
723         gi = ID1FindSeqId (sip);
724     }
725 
726     if(gi != 0)
727         goto retpoint;
728 
729     /* OK. We failed to find gi using string as TextSeqId. Now trying
730        last time - with PDBSeqIdPtr */
731 
732     if((psip = PDBSeqIdNew()) == NULL)
733         return -1;
734 
735     sip->choice = SEQID_PDB;
736     sip->data.ptrvalue = psip;
737 
738     psip->mol = StringSave(buffer);
739     psip->chain = version;
740 
741     gi = ID1FindSeqId (sip);
742 
743     SeqIdFree(sip);
744 
745  retpoint:
746     TextSeqIdFree(tsip);
747     return gi;
748 }
749 
/* Translate a gi number or an accession into a list of gis.

   accession  text of the gi or accession
   giptr      out: newly allocated array of gis (free with MemFree())
   database   0 - nucleotide, otherwise protein

   A string that atol() converts to a non-zero number is taken as a gi.
   Anything else is searched in Entrez as "<accession>[ACCN]".

   Returns the number of gis stored in *giptr, 0 when nothing is found
   or on error. */
Int4 BE_AccToGi(CharPtr accession, Uint4 **giptr, Int4 database)
{
    Int4 count;
    Int4 gi;
    CharPtr term;

    /* Checking if this is gi number */

    if((gi = atol(accession)) != 0) {
        if((*giptr = (Uint4 *)MemNew(sizeof(Uint4))) == NULL)
            return 0;
        (*giptr)[0] = gi;
        return 1;
    }

    /* The accession may be of any length, so the query term is built in
       a buffer of exactly the needed size; sizeof("[ACCN]") accounts for
       the terminating NUL. */
    term = (CharPtr)MemNew(StringLen(accession) + sizeof("[ACCN]"));
    if(term == NULL) {
        *giptr = NULL;
        return 0;
    }
    sprintf(term, "%s[ACCN]", accession);

    count = BEGetUidsFromQuery(term, giptr,
                               (Boolean)(database == 0), /* Nucleotide ? */
                               FALSE);
    MemFree(term);

    return count > 0 ? count : 0;
}
771 
772 #define UID_BUFF_SIZE 2048
773 
BE_ReadIds(BEDataPtr pBdata,Uint4 ** ids_out)774 Int4 BE_ReadIds(BEDataPtr pBdata, Uint4 **ids_out)
775 {
776     Uint4 *uids, *giptr;
777     Int4 length, NumNotValid = 0, gi;
778     Int4 i, j, k, allocated, count = 0;
779     Char TmpBuff[16];
780     CharPtr buffer;
781 
782     if((buffer = pBdata->uids) == NULL || *buffer == NULLB) {
783         *ids_out = NULL;
784         return 0;
785     }
786 
787     length = StringLen(buffer);
788 
789     allocated = UID_BUFF_SIZE;
790     uids = (Uint4 *)MemNew(allocated * sizeof(Uint4));
791 
792     for(i = 0; i < length; i++) {
793 
794         if(isspace(buffer[i]) || buffer[i] == ',') /* Rolling spaces */
795             continue;
796 
797         /* This is defence from badly formatted requests */
798 
799         if(NumNotValid > 10) {
800             printf("**** ERROR: Too many invalid Gis/Accessions, "
801                    "parsing aborted\n");
802             *ids_out = NULL;
803             return 0;
804         }
805 
806         /* Rolling spaces */
807 
808         j= 0;
809         while (!isspace(buffer[i]) && j < 10  && i < length) {
810             TmpBuff[j] = buffer[i];
811             j++; i++;
812             if(buffer[i] == ',')  /* Comma is valid delimiter */
813                 break;
814         }
815         TmpBuff[j] = NULLB;
816 
817 
818         /* Ignore strings like ">Protein" */
819 
820         if(j > 0 && TmpBuff[0] == '>' && IS_ALPHA(TmpBuff[1]))
821             continue;
822 
823         /* Is gi/accession too long ??? */
824 
825         if(j == 10) {
826             NumNotValid++;
827 
828             while(!isspace(buffer[i]) ||
829                   buffer[i] == ',' ||
830                   buffer[i] == NULLB) /* Rolling until spaces */
831                 i++;
832             continue;  /* Next may be valid ... who knows...?? */
833         }
834 
835         /* Now validating accession/gi */
836 
837         for(k =0; k < j; k++) {
838             if(!isdigit(TmpBuff[k])) {
839                 break;
840             }
841         }
842         if(k != j) {
843             if(!IS_ntdb_accession(TmpBuff) && !IS_protdb_accession(TmpBuff)) {
844                 NumNotValid++;
845                 printf("**** WARNING: Gi/Accession \"%s\" is not valid\n",
846                        TmpBuff);
847                 continue;
848             }
849         }
850 
851         /* If this is valid Accession check and tranfer it to gi */
852 
853         giptr = NULL;
854 
855         if((gi = BEAccessionToGi (TmpBuff)) < 0) {
856             printf("**** WARNING: Gi/Accession %s is not found "
857                    "in database----\r\n", TmpBuff);
858             /* NumNotValid++; */
859             continue;
860         } else {
861             if(count == allocated) {
862                 allocated += UID_BUFF_SIZE;
863                 uids = (Uint4*)Realloc(uids, allocated * sizeof(Uint4));
864             }
865             uids[count] = gi;
866             count++;
867         }
868     }
869 
870     if(NumNotValid) {
871         printf("**** %d invalid Gi%s/Accession%s present in Entrez-batch "
872                "request\r\n",
873                (int)NumNotValid,
874                NumNotValid == 1 ? "" : "s",
875                NumNotValid == 1 ? "" : "s"
876                );
877     }
878 
879     *ids_out = uids;
880     return count;
881 }
/* Header callback passed to head_tail_ff() by Main(); does nothing. */
static void BatchHead(VoidPtr pointer, FILE *fd)
{
    (void)pointer;
    (void)fd;
}
/* Trailer callback passed to head_tail_ff() by Main(); does nothing. */
static void BatchTail(VoidPtr pointer, FILE *fd)
{
    (void)pointer;
    (void)fd;
}
890 
QSRV_Time(CharPtr string,Int4 len,time_t seconds)891 Boolean QSRV_Time(CharPtr string, Int4 len, time_t seconds)
892 {
893     if(string == NULL || len < 25)
894         return FALSE;
895 
896     if(!seconds) {
897         seconds = GetSecs();
898     }
899 
900 #if defined(OS_UNIX_IRIX) || defined(OS_UNIX_LINUX) || defined(OS_UNIX_OSF1) || defined(OS_UNIX_FREEBSD)
901     ctime_r(&seconds, string);
902 #elif defined(OS_UNIX_DARWIN) /* no ctime_r :-/ */
903     strncpy(string, ctime(&seconds), len - 1);
904     string[len - 1] = '\0';
905 #else
906     ctime_r(&seconds, string, len);
907 #endif
908 
909     string[24] = NULLB;
910     return TRUE;
911 }
912 
/* Append one line describing the request `pBdata` to the request log.
   The log is QSRV_LOGFILE_NAME, opened by that name first and, when
   that fails, under /tmp.  Nothing is written when pBdata is NULL or
   when neither file can be opened. */
void QSRVWriteLogInfo(BEDataPtr pBdata)
{
    FILE    *logfp;
    Char     fallback[128];
    Char     stamp[64];
    CharPtr  term;

    if(pBdata == NULL)
        return;

    logfp = FileOpen(QSRV_LOGFILE_NAME, "a");
    if(logfp == NULL) {
        sprintf(fallback, "/tmp/%s", QSRV_LOGFILE_NAME);
        logfp = FileOpen(fallback, "a");
        if(logfp == NULL)
            return;
    }

    QSRV_Time(stamp, sizeof(stamp), 0);

    if(pBdata->query == NULL)
        term = "(null)";
    else
        term = pBdata->query;

    fprintf(logfp,
            "%s|db=%d|term=\"%s\"|format=%d|count=%d|%d|%d|%d|%d|%d|%d\n",
            stamp, pBdata->database,
            term,
            pBdata->format, pBdata->count, pBdata->allset,
            pBdata->request_type, pBdata->html, pBdata->savetodisk,
            pBdata->noheader, pBdata->commandline);

    fflush(logfp);
    FileClose(logfp);
}
943 
Main(void)944 Int2 Main(void)
945 {
946     WWWInfoPtr info;
947     WWWErrorCode error;
948     BEDataPtr pBdata;
949     Int4 count;
950     Uint4 *ids;
951     Char tmp[512];
952 
953     if((error = WWWReadPosting(&info)) != WWWErrOk) {
954         ErrPostEx(SEV_FATAL, 88, 0, "Error in processing WWW request");
955         return 1;
956     }
957 
958     if(WWWGetMethod(info) == COMMAND_LINE) {
959         WWWInfoFree(info);
960 
961         if((pBdata = BEMakeCLParam()) == NULL)
962             return 1;
963 
964     } else {
965 
966         if(!ErrSetLogfile (QSRV_LOGFILE_STD_NAME, ELOG_APPEND))
967             return 1;
968 
969         if((pBdata = BEMakeWWWParam(info)) == NULL) {
970             printf("Content-type: text/html\n\n");
971             printf("QSRV_STATUS 802 Invalid input parameters <PRE><BR>\n");
972             fflush(stdout);
973 
974             ErrPostEx(SEV_ERROR, 88, 0,
975                       "Error in reading parameters. "
976                       "Please check, that query string was set\n");
977             return 1;
978         }
979     }
980 
981     if(!pBdata->commandline) {
982         if(pBdata->savetodisk)
983             printf("Content-type: chemical/seq-%s-genbank\n\n",
984                    pBdata->database == 0 ? "na" : "aa");
985         else {
986             printf("Content-type: text/html\n\n");
987             printf("<HTML><HEADER><TITLE>Batch Entrez results"
988                    "</TITLE></HEADER><PRE>\n");
989         }
990     }
991 
992     if (! ID1BioseqFetchEnable("Nbatch", TRUE)) {
993         printf("Cannot initialize ID1\n");
994         return 1;
995     }
996 
997     if(pBdata->html) {
998         init_www(); /* initializing WWW mode */
999         head_tail_ff(NULL, BatchHead, BatchTail);
1000 
1001     }
1002 
1003     SeqEntryLoad();
1004 
1005     switch(pBdata->request_type) {
1006     case REQ_DEFAULT:
1007         if(pBdata->uids == NULL) {
1008             count = BEGetUidsFromQuery(pBdata->query, &ids,
1009                                pBdata->database == 0, /* Nucleotide ? */
1010                                FALSE);
1011         } else {
1012             count = BE_ReadIds(pBdata, &ids);
1013         }
1014         break;
1015     case REQ_ADVANCED_QUERY:
1016     case REQ_ORGANISM:
1017         count = BEGetUidsFromQuery(pBdata->query, &ids,
1018                                    pBdata->database == 0, /* Nucleotide ? */
1019                                    FALSE);
1020         break;
1021     case REQ_LIST_OF_GIS:
1022         count = BE_ReadIds(pBdata, &ids);
1023         break;
1024     }
1025     pBdata->count = count;
1026 
1027     QSRVWriteLogInfo(pBdata);
1028 
1029     if(count < 0) {
1030         if(pBdata->commandline)
1031             ErrPostEx(SEV_ERROR, 0, count, "Error in searching the database");
1032         else
1033             printf("QSRV_STATUS %d Error in searching database\n", count);
1034         return 1;
1035     }
1036 
1037     if(count == 0) {
1038         if(pBdata->commandline)
1039             ErrPostEx(SEV_INFO, 0,0, "No entries found");
1040         else
1041             printf("QSRV_STATUS 901 OK No entries found\n", count);
1042         return 1;
1043     }
1044 
1045     if((count > 20000 && pBdata->format == F_GEN) ||
1046        (count > 100000 && pBdata->format != F_GILIST)) {
1047 
1048         if(pBdata->commandline)
1049             ErrPostEx(SEV_WARNING, 0,0, "Number of sequences %d exceed limit",
1050                       count);
1051         else
1052             printf("QSRV_STATUS 803 Error Number of sequences %d "
1053                    "exceed limit\n", count);
1054 
1055         pBdata->format = F_GILIST;
1056     }
1057 
1058     if(!pBdata->noheader && !pBdata->commandline)
1059         printf("QSRV_STATUS 900 OK: %d entr%s found <PRE><BR>\n",
1060                count, count == 1? "y" : "ies");
1061 
1062     fflush(stdout);
1063 
1064     /* Printing results */
1065 
1066     BEPrintIds(pBdata, ids, count);
1067 
1068     /* Clearing memory */
1069 
1070     MemFree(ids);
1071     BEFreeCLParam(pBdata);
1072 
1073     return 0;
1074 }
1075