1 /* $Id: qbatch.c,v 6.12 2005/10/13 13:53:51 kans Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: $RCSfile: qbatch.c,v $
27 *
28 * Author: Sergei Shavirin
29 *
30 * Version Creation Date: 05/04/2000
31 *
32 * $Revision: 6.12 $
33 *
34 * File Description:
35 * WWW and Command-line Batch Entrez using Entrez2 and ID1
36 *
37 * $Log: qbatch.c,v $
38 * Revision 6.12 2005/10/13 13:53:51 kans
39 * commented out depracated EntrezSetServer function call
40 *
41 * Revision 6.11 2004/08/13 02:01:21 beloslyu
42 * Changes for FreeBSD
43 *
44 * Revision 6.10 2002/10/24 17:37:36 ucko
45 * Kludge around Darwin's lack of ctime_r.
46 *
47 * Revision 6.9 2001/03/19 16:18:17 beloslyu
48 * fix the args for ctime_r for OSF1 on alpha
49 *
50 * Revision 6.8 2000/08/25 21:02:15 shavirin
51 * Changed Flat file printing from DUMP_MODE to RELEASE_MODE.
52 *
53 * Revision 6.7 2000/06/21 14:34:35 beloslyu
54 * fix the args for ctime_r on linux
55 *
56 * Revision 6.6 2000/05/31 12:37:17 kans
57 * removed SwapUint4, which somehow got back into the code
58 *
59 * Revision 6.5 2000/05/30 16:20:28 kans
60 * removed cvs merge when incorrect endian code was removed
61 *
62 * Revision 6.3 2000/05/26 18:05:53 shavirin
63 * Added protection against big-little endians for uids from Entrez.
64 *
65 * Revision 6.2 2000/05/09 13:37:01 shavirin
66 * Use functions IS_ntdb_accession() and IS_protdb_accession() to
67 * verify accession number.
68 *
69 * Revision 6.1 2000/05/04 21:14:33 shavirin
70 * Initial revision.
71 *
72 *
73 * ==========================================================================
74 */
75
76 #include <ncbi.h>
77 #include <sequtil.h>
78 #include <asn2ff.h>
79 #include <tofasta.h>
80 #include <ffprint.h>
81 #include <ent2api.h>
82 #include <accid1.h>
83
/* Entrez database names. */
#define DB_NUCLEOTIDE "Nucleotide"
#define DB_PROTEIN "Protein"
#define DB_MEDLINE "Medline"

/* Number of entries in the BE_args command-line argument table. */
#define NUMARGS (sizeof(BE_args)/sizeof(BE_args[0]))
89
90 Args BE_args[] = {
91 {"Database\n"
92 " 0 - Nucleotide \n"
93 " 1 - Protein",
94 "0", NULL, NULL, FALSE,'s',ARG_INT,0.0,0,NULL},
95 {"Format of output\n"
96 " 0 - GenBank/GenPept \n"
97 " 1 - FASTA \n"
98 " 2 - ASN.1 \n"
99 " 3 - List of GIs\n"
100 " 4 - List of Deflines\n"
101 " 5 - Single GenBank Bioseq-set ASN.1",
102 "0", NULL,NULL,FALSE,'f',ARG_INT, 0.0,0,NULL},
103 {"Number of returned entries from single gi/accession\n"
104 " 0 - Return single entry specified by gi/accession \n"
105 " 1 - Return all entries in SeqEntry, that available",
106 "0", NULL,NULL,FALSE,'n',ARG_INT,0.0,0,NULL},
107 {"Text or HTML? (For GenBank/GenPept)\n"
108 " 0 - Text output \n"
109 " 1 - HTML output",
110 "0", NULL,NULL,FALSE,'h',ARG_INT, 0.0,0,NULL},
111 {"Query string",
112 NULL, NULL,NULL,TRUE,'q',ARG_STRING, 0.0,0,NULL},
113 {"ID (accession or gi)",
114 NULL, NULL,NULL,TRUE,'u',ARG_STRING, 0.0,0,NULL},
115 {"File with list of GIS/Accessions",
116 "stdin", NULL,NULL,TRUE,'i',ARG_FILE_IN, 0.0,0,NULL},
117 {"Logfile name:",
118 "stdout", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL}
119 };
120
/* Output formats; the numbers match the "-f" option and the FORMAT/DOPT
   WWW parameter. */
#define F_GEN 0          /* GenBank/GenPept flat file */
#define F_FASTA 1        /* FASTA */
#define F_ASN1 2         /* ASN.1, one Seq-entry per id */
#define F_GILIST 3       /* plain list of gis */
#define F_DLIST 4        /* list of deflines */
#define F_ASN1_GENB 5    /* all entries wrapped in a single Bioseq-set */

/* Request types selected by the REQUEST_TYPE WWW parameter. */
#define REQ_DEFAULT 0
#define REQ_LIST_OF_GIS 1
#define REQ_ADVANCED_QUERY 2
#define REQ_ORGANISM 3

#define SQL_SRV_NAME "PUBSEQ_OS"

/* Usage log written by QSRVWriteLogInfo() and error log used in WWW mode. */
#define QSRV_LOGFILE_NAME "qserver.log"
#define QSRV_LOGFILE_STD_NAME "/tmp/qserver_std.log"
137
138 typedef VoidPtr QSRVHandlePtr;
139
140 static CharPtr BE_Dbname[] = {"Nucleotide", "Protein", "Medline"};
141
142 typedef struct BEData {
143 Int4 database;
144 CharPtr query;
145 CharPtr uids;
146 Int4 format;
147 Int4 allset;
148 Int4 request_type;
149 Boolean html;
150 Int4 savetodisk;
151 Int4 maxdocs;
152 Int4 noheader;
153 Int4 commandline;
154 Int4 count;
155 } BEData, PNTR BEDataPtr;
156
157 typedef struct BGenBank {
158 AsnIoPtr aip;
159 AsnTypePtr atp;
160 AsnTypePtr atp_bioseq_set_seq_set;
161 AsnTypePtr atp_bioseq_set;
162 } BGenBank, PNTR BGenBankPtr;
163
/*
 * Look up the ASN.1 type called <name> and store it in <atp>.
 *
 * Relies on a variable "amp" (the AsnModulePtr to search) being in scope at
 * the point of use.  On failure an error is posted and the ENCLOSING
 * function returns NULL, so the macro may only be used in functions that
 * return a pointer.
 *
 * Wrapped in do { } while (0) so that it behaves as a single statement and
 * cannot capture a following "else".
 */
#define MACRO_atp_find(atp,name)\
    do {\
        if(((atp) = AsnTypeFind(amp, #name)) == NULL) {\
            ErrPostEx(SEV_ERROR,0,0,\
                      "Could not find type <%s>", #name);\
            return NULL;\
        }\
    } while (0)
170
BESeqEntryGet(Int4 gi)171 SeqEntryPtr BESeqEntryGet(Int4 gi)
172 {
173 SeqEntryPtr sep = NULL;
174
175 if((sep = ID1SeqEntryGet(gi, 0)) == NULL) {
176 fprintf(stderr, "Sequence ID %d cannot be retrieved "
177 "from the database", gi);
178 }
179
180 return sep;
181 }
182
BGenBankInit(void)183 BGenBankPtr BGenBankInit(void)
184 {
185 BGenBankPtr bgbp;
186 AsnModulePtr amp;
187
188 AsnTypePtr atp_bioseq_set;
189 AsnTypePtr atp_bioseq_set_level;
190 AsnTypePtr atp_bioseq_set_class;
191 AsnTypePtr atp_bioseq_set_release;
192 AsnTypePtr atp_bioseq_set_date;
193 AsnTypePtr atp_bioseq_set_seq_set;
194 AsnTypePtr atp_bioseq_set_seq_set_E;
195
196 Char release[] = "Q-server Production";
197 Char date_time[128];
198 NCBI_Date date;
199 DataVal dv;
200
201 DateClean(&date);
202
203 bgbp = (BGenBankPtr)MemNew(sizeof(BGenBank));
204 bgbp->aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
205
206 amp = AsnAllModPtr();
207
208 MACRO_atp_find(atp_bioseq_set,Bioseq-set);
209 MACRO_atp_find(atp_bioseq_set_level,Bioseq-set.level);
210 MACRO_atp_find(atp_bioseq_set_class,Bioseq-set.class);
211 MACRO_atp_find(atp_bioseq_set_release,Bioseq-set.release);
212 MACRO_atp_find(atp_bioseq_set_date,Bioseq-set.date);
213 MACRO_atp_find(atp_bioseq_set_seq_set,Bioseq-set.seq-set);
214 MACRO_atp_find(atp_bioseq_set_seq_set_E,Bioseq-set.seq-set.E);
215
216 if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set,NULL))
217 return NULL;
218 dv.intvalue = 0;
219
220 if(!AsnWrite(bgbp->aip,atp_bioseq_set_level,&dv))
221 return NULL;
222 dv.intvalue = 7;
223
224 if(!AsnWrite(bgbp->aip,atp_bioseq_set_class,&dv))
225 return NULL;
226 dv.ptrvalue = &release;
227
228 if(!AsnWrite(bgbp->aip,atp_bioseq_set_release,&dv))
229 return NULL;
230
231 Nlm_DayTimeStr(date_time,TRUE,TRUE);
232 date.str=date_time;
233
234 if(!DateAsnWrite(&date,bgbp->aip,atp_bioseq_set_date))
235 return NULL;
236
237 if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set_seq_set,NULL))
238 return NULL;
239 AsnIoFlush(bgbp->aip);
240
241 bgbp->atp =atp_bioseq_set_seq_set_E;
242 bgbp->atp_bioseq_set_seq_set = atp_bioseq_set_seq_set;
243 bgbp->atp_bioseq_set = atp_bioseq_set;
244
245 return bgbp;
246 }
/*
 * Finish a Bioseq-set stream started by BGenBankInit(): close the seq-set
 * and Bioseq-set structures, close the ASN.1 stream and free the handle.
 * A NULL handle (for example after a failed BGenBankInit()) is ignored.
 */
void BGenBankClose(BGenBankPtr bgbp)
{
    if(bgbp == NULL)
        return;

    AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set_seq_set, NULL);
    AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set, NULL);

    AsnIoClose(bgbp->aip);
    MemFree(bgbp);
}
256
BEPrintIds(BEDataPtr pBdata,Uint4 * ids,int count)257 void BEPrintIds(BEDataPtr pBdata, Uint4 *ids, int count)
258 {
259 Int4 i;
260 SeqEntryPtr sep, sep_all;
261 Boolean retvalue = TRUE;
262 SeqIdPtr sip = NULL;
263 BioseqPtr bsp;
264 BGenBankPtr bgbp;
265 AsnIoPtr aip;
266 Boolean is_na = FALSE;
267
268 if(pBdata->format == F_GILIST) {
269 for(i = 0; i < count; i++)
270 fprintf(stdout, "%d\n", (int) ids[i]);
271 return;
272 }
273
274 if(pBdata->database == 0)
275 is_na = TRUE;
276
277 if(pBdata->format == F_ASN1_GENB)
278 bgbp = BGenBankInit();
279
280 for(i = 0; i < count; i++) {
281
282 sep_all = BESeqEntryGet(ids[i]);
283
284 if(sep_all == NULL) {
285 ErrPostEx(SEV_ERROR, 88, 67, "Retrieving of blob for the "
286 "gi=%d failed", (int)ids[i]);
287 continue;
288 }
289
290 if(!pBdata->allset) {
291 ObjMgrRegister(OBJ_SEQENTRY, sep_all);
292 sip = ValNodeNew(NULL);
293 sip->choice = SEQID_GI;
294 sip->data.intvalue = ids[i];
295
296 if((bsp = BioseqFind(sip)) == NULL) {
297 ErrPostEx(SEV_ERROR, 88, 67,
298 "Error finding bioseq for gi=%d\n", (int)ids[i]);
299 continue;
300 }
301
302 sep = SeqEntryNew();
303 sep->choice = 1; /* Bioseq */
304 sep->data.ptrvalue = bsp;
305 } else {
306 sep = sep_all;
307 }
308
309 switch(pBdata->format) {
310 case F_FASTA: /* 1 */
311
312 if(!SeqEntryToFasta(sep, stdout, is_na)) {
313 if(!SeqEntryToFasta(sep, stdout, !is_na)) {
314 ErrPostEx(SEV_ERROR, 88, 67, "Printing of FASTA format "
315 "(gi=%d) failed\r\n", (int)ids[i]);
316 }
317 }
318
319 break;
320 case F_ASN1: /* 2 */
321
322 aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
323 SeqEntryAsnWrite(sep, aip, NULL);
324 AsnIoClose(aip);
325
326 break;
327 case F_GILIST: /* 3 */
328 for(i = 0; i < count; i++)
329 fprintf(stdout, "%d\n", (int) ids[i]);
330 break;
331 case F_DLIST: /* 4 */
332 if (IS_Bioseq(sep))
333 retvalue = SeqEntrysToDefline(sep, stdout, is_na, 3);
334 else
335 retvalue = SeqEntrysToDefline(sep, stdout, is_na, 0);
336
337 if(retvalue == FALSE) {
338 if (IS_Bioseq(sep))
339 retvalue = SeqEntrysToDefline(sep, stdout, !is_na, 3);
340 else
341 retvalue = SeqEntrysToDefline(sep, stdout, !is_na, 0);
342 }
343 break;
344 case F_ASN1_GENB: /* 5 */
345 retvalue = SeqEntryAsnWrite(sep, bgbp->aip, bgbp->atp);
346 break;
347 default:
348 case F_GEN: /* 0 */
349 if(!SeqEntryToFlatEx(sep_all, stdout,
350 is_na ? GENBANK_FMT : GENPEPT_FMT,
351 RELEASE_MODE, sip, FF_REGULAR)) {
352
353 if(!SeqEntryToFlatEx(sep_all, stdout,
354 is_na ? GENPEPT_FMT : GENBANK_FMT,
355 RELEASE_MODE, sip, FF_REGULAR)) {
356 }
357 }
358 break;
359 }
360
361 SeqEntryFree(sep_all);
362 ValNodeFree(sip);
363 }
364
365 if(pBdata->format == F_ASN1_GENB)
366 BGenBankClose(bgbp);
367
368 return;
369 }
/*
 * Release a request structure created by BEMakeCLParam() or
 * BEMakeWWWParam(), together with the query and uids strings it owns.
 * A NULL pointer is ignored.
 */
void BEFreeCLParam(BEDataPtr pBdata)
{
    if(pBdata == NULL)
        return;

    MemFree(pBdata->query);
    MemFree(pBdata->uids);
    MemFree(pBdata);
}
BEMakeCLParam(void)378 BEDataPtr BEMakeCLParam(void)
379 {
380 BEDataPtr pBdata;
381 FILE *fd;
382
383 if (!GetArgs ("qserver", NUMARGS, BE_args))
384 return NULL;
385
386 if(!ErrSetLogfile (BE_args[7].strvalue, ELOG_APPEND))
387 exit(1);
388
389 pBdata = (BEDataPtr)MemNew(sizeof(BEData));
390
391 pBdata->database = BE_args[0].intvalue;
392 pBdata->format = BE_args[1].intvalue;
393 pBdata->allset = BE_args[2].intvalue;
394 pBdata->html = (Uchar) BE_args[3].intvalue;
395 pBdata->query = StringSave(BE_args[4].strvalue);
396
397 if(BE_args[5].strvalue != NULL)
398 pBdata->uids = StringSave(BE_args[5].strvalue);
399 else if(BE_args[4].strvalue == NULL) {
400 fd = FileOpen(BE_args[6].strvalue, "r");
401 pBdata->uids = WWWReadFileInMemory(fd, 0, FALSE);
402 FileClose(fd);
403 }
404
405 pBdata->commandline = TRUE;
406
407 if(pBdata->query == NULL && pBdata->uids == NULL) {
408 MemFree(pBdata);
409 ErrPostEx(SEV_ERROR, 88, 0,
410 "Error in reading parameters. "
411 "Please check, that query string was set\n");
412 return NULL;
413 }
414
415 return pBdata;
416 }
/*
 * Replace every line feed and carriage return in the string with a space,
 * in place.  A NULL string is left alone.
 */
static void CleanCRLF(CharPtr query)
{
    CharPtr cursor = query;

    if(cursor == NULL)
        return;

    while(*cursor != NULLB) {
        switch(*cursor) {
        case '\n':
        case '\r':
            *cursor = ' ';
            break;
        default:
            break;
        }
        cursor++;
    }
}
430
BEMakeWWWParam(WWWInfoPtr info)431 BEDataPtr BEMakeWWWParam(WWWInfoPtr info)
432 {
433 BEDataPtr pBdata;
434 CharPtr chptr;
435 Char tmp[512];
436
437 pBdata = (BEDataPtr)MemNew(sizeof(BEData));
438
439 /* Database to search */
440
441 if((chptr = WWWGetValueByName(info, "DATABASE")) == NULL) {
442 if((chptr = WWWGetValueByName(info, "DB")) == NULL)
443 if((chptr = WWWGetValueByName(info, "DATALIB")) == NULL) {
444 chptr = "n"; /* Default to nucleotides */
445 }
446 }
447
448 switch(*chptr) {
449 case 'n':
450 pBdata->database = 0;
451 break;
452 case 'p':
453 pBdata->database = 1;
454 break;
455 default:
456 pBdata->database = 0;
457 break;
458 }
459
460 if((chptr = WWWGetValueByName(info, "REQUEST_TYPE")) != NULL) {
461 if(!StringICmp(chptr, "LIST_OF_GIS"))
462 pBdata->request_type = REQ_LIST_OF_GIS;
463 else if(!StringICmp(chptr, "ADVANCED_QUERY"))
464 pBdata->request_type = REQ_ADVANCED_QUERY;
465 else if(!StringICmp(chptr, "ORGANISM"))
466 pBdata->request_type = REQ_ORGANISM;
467 } else {
468 pBdata->request_type = REQ_DEFAULT;
469 }
470
471 switch(pBdata->request_type) {
472 case REQ_ADVANCED_QUERY:
473 case REQ_LIST_OF_GIS:
474 case REQ_DEFAULT:
475 /* Query string */
476
477 if((chptr = WWWGetValueByName(info, "TERM")) == NULL)
478 chptr = WWWGetValueByName(info, "QUERY");
479
480 if(chptr != NULL && *chptr != NULLB) {
481 pBdata->query = StringSave(chptr);
482 CleanCRLF(pBdata->query);
483 }
484
485 /* List of UIDs */
486
487 if((chptr = WWWGetValueByName(info, "UID")) != NULL &&
488 *chptr != NULLB) {
489 pBdata->uids = StringSave(chptr);
490 }
491 break;
492 case REQ_ORGANISM:
493 /* Query for organism retrieval */
494
495 if(((chptr = WWWGetValueByName(info, "ORGNAME")) != NULL &&
496 *chptr != NULLB) ||
497 ((chptr = WWWGetValueByName(info, "LIST_ORG")) != NULL &&
498 *chptr != NULLB && StringICmp (chptr, "(None)"))) {
499 sprintf(tmp, "%s[ORGN]", chptr);
500 pBdata->query = StringSave(tmp);
501 }
502 break;
503 }
504
505 /* Checking for data consistency */
506
507 if(pBdata->request_type == REQ_LIST_OF_GIS && pBdata->uids == NULL)
508 goto fail_return;
509 if((pBdata->request_type == REQ_ADVANCED_QUERY ||
510 pBdata->request_type == REQ_ORGANISM) && pBdata->query == NULL)
511 goto fail_return;
512 if(pBdata->query == NULL && pBdata->uids == NULL)
513 goto fail_return;
514
515 /* Format of output */
516
517 if((chptr = WWWGetValueByName(info, "FORMAT")) != NULL ||
518 (chptr = WWWGetValueByName(info, "DOPT")) != NULL) {
519 pBdata->format = atoi(chptr);
520 } else {
521 pBdata->format = F_GILIST;
522 }
523
524 /* HTML output */
525
526 if((chptr = WWWGetValueByName(info, "HTML")) != NULL) {
527 if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
528 !StringICmp(chptr, "0"))
529 pBdata->html = FALSE;
530 else
531 pBdata->html = TRUE;
532 }
533
534 /* Output type */
535
536 if((chptr = WWWGetValueByName(info, "SAVETO")) != NULL) {
537 if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
538 !StringICmp(chptr, "0"))
539 pBdata->savetodisk = FALSE;
540 else
541 pBdata->savetodisk = TRUE;
542 }
543
544
545 if((chptr = WWWGetValueByName(info, "ALLSET")) != NULL) {
546 if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
547 !StringICmp(chptr, "0"))
548 pBdata->allset = FALSE;
549 else
550 pBdata->allset = TRUE;
551 }
552
553 if((chptr = WWWGetValueByName(info, "MAXDOCS")) != NULL ||
554 (chptr = WWWGetValueByName(info, "DISPMAX")) != NULL)
555 pBdata->maxdocs = atol(chptr);
556
557 if((chptr = WWWGetValueByName(info, "NOHEADER")) != NULL) {
558 if(!StringICmp(chptr, "NO") || !StringICmp(chptr, "FALSE") ||
559 !StringICmp(chptr, "0"))
560 pBdata->noheader = FALSE;
561 else
562 pBdata->noheader = TRUE;
563 }
564
565 return pBdata;
566
567 fail_return:
568 MemFree(pBdata);
569 return NULL;
570 }
571
572 /* This function is interface to the Entrez2 engine. It may be used
573 to get list of gis corresponding to the Entrez Boolean string or
574 just number of such hits in the Entrez database */
575
BEGetUidsFromQuery(CharPtr query,Uint4Ptr PNTR uids,Boolean is_na,Boolean count_only)576 static Int4 BEGetUidsFromQuery(CharPtr query, Uint4Ptr PNTR uids,
577 Boolean is_na, Boolean count_only)
578 {
579 Entrez2ReplyPtr e2ry;
580 Entrez2RequestPtr e2rq;
581 E2ReplyPtr e2rp;
582 Int4 count = 0, i;
583 Entrez2BooleanReplyPtr e2br;
584 Entrez2IdListPtr e2idlist;
585
586 *uids = NULL;
587
588 EntrezSetProgramName ("BLAST API");
589 /* EntrezSetServer ("www.ncbi.nlm.nih.gov", 80,
590 "/entrez/utils/entrez2server.fcgi"); */
591
592 e2rq = EntrezCreateBooleanRequest (!count_only, FALSE,
593 is_na? "Nucleotide" : "Protein",
594 query, 0, 0, NULL, 0, 0);
595
596 e2ry = EntrezSynchronousQuery (e2rq);
597
598 if (e2ry == NULL) {
599 ErrPostEx(SEV_ERROR, 0, 0,
600 "NULL returned from EntrezSynchronousQuery()");
601 return -1;
602 }
603
604 if((e2rp = e2ry->reply) == NULL) {
605 ErrPostEx(SEV_ERROR, 0, 0, "Invalid ASN.1: E2ReplyPtr==NULL");
606 return -1;
607 }
608
609 switch(e2rp->choice) {
610
611 case E2Reply_error:
612 ErrPostEx(SEV_ERROR, 0, 0, (CharPtr) e2rp->data.ptrvalue);
613 count = -1;
614 break;
615 case E2Reply_eval_boolean:
616 e2br = (Entrez2BooleanReplyPtr) e2rp->data.ptrvalue;
617 count = e2br->count;
618 if((e2idlist = e2br->uids) != NULL) {
619 count = e2idlist->num;
620 *uids = MemNew(sizeof(Int4)*count);
621 BSSeek((ByteStorePtr) e2idlist->uids, 0, SEEK_SET);
622 BSRead((ByteStorePtr) e2idlist->uids, *uids, sizeof(Int4)*count);
623
624 }
625 break;
626 default:
627 ErrPostEx(SEV_ERROR, 0, 0, "Invalid reply type from the server: %d", e2rp->choice);
628 count = -1;
629 break;
630
631 }
632
633 Entrez2ReplyFree(e2ry);
634 Entrez2RequestFree(e2rq);
635
636 return count;
637 }
BEAccessionToGi(CharPtr string)638 static Int4 BEAccessionToGi (CharPtr string)
639 {
640 Char buffer[32];
641 CharPtr chptr;
642 Int2 version;
643 Int4 gi, index;
644 SeqIdPtr sip;
645 TextSeqIdPtr tsip;
646 PDBSeqIdPtr psip;
647 long tmplong;
648 Boolean digit;
649
650 for(chptr = string, digit = TRUE; *chptr != NULLB; chptr++) {
651 if(!IS_DIGIT(*chptr)) {
652 digit = FALSE;
653 break;
654 }
655 }
656
657 if(digit) {
658 if((gi = atol(string)) > 0)
659 return gi;
660 }
661
662 /* all letters in accesion should be upper */
663 string = Nlm_StringUpper(string);
664
665 gi = 0;
666
667 if((sip = ValNodeNew (NULL)) == NULL)
668 return -1;
669
670 index = 0; version = 0;
671 while (*string != '\0' && index < 16) {
672 if (*string == '.')
673 break;
674 buffer[index] = *string;
675 string++;
676 index++;
677 }
678
679 buffer[index] = '\0';
680 if (*string == '.' && *(string+1) != '\0') {
681 sscanf((string+1), "%ld", &tmplong);
682 version = (Int2) tmplong;
683 }
684
685 if((tsip = TextSeqIdNew ()) == NULL)
686 return -1;
687
688 tsip->accession = StringSave(buffer);
689 tsip->version = version;
690
691 /* GenBank, EMBL, and DDBJ. */
692 sip->choice = SEQID_GENBANK;
693 sip->data.ptrvalue = (Pointer) tsip;
694 gi = ID1FindSeqId (sip);
695
696 if (gi == 0) {
697 /* SwissProt. */
698 sip->choice = SEQID_SWISSPROT;
699 gi = ID1FindSeqId (sip);
700 } else {
701 goto retpoint;
702 }
703
704 if (gi == 0) {
705 /* PIR */
706 sip->choice = SEQID_PIR;
707 gi = ID1FindSeqId (sip);
708 } else {
709 goto retpoint;
710 }
711
712 if (gi == 0) {
713 /* PRF */
714 sip->choice = SEQID_PRF;
715 gi = ID1FindSeqId (sip);
716 } else {
717 goto retpoint;
718 }
719
720 if (gi == 0) {
721 /* OTHER, probably 'ref' */
722 sip->choice = SEQID_OTHER;
723 gi = ID1FindSeqId (sip);
724 }
725
726 if(gi != 0)
727 goto retpoint;
728
729 /* OK. We failed to find gi using string as TextSeqId. Now trying
730 last time - with PDBSeqIdPtr */
731
732 if((psip = PDBSeqIdNew()) == NULL)
733 return -1;
734
735 sip->choice = SEQID_PDB;
736 sip->data.ptrvalue = psip;
737
738 psip->mol = StringSave(buffer);
739 psip->chain = version;
740
741 gi = ID1FindSeqId (sip);
742
743 SeqIdFree(sip);
744
745 retpoint:
746 TextSeqIdFree(tsip);
747 return gi;
748 }
749
/*
 * Resolve a gi or an accession to a list of gis.
 *
 *   accession - gi (anything atol() converts to non-zero) or accession
 *   giptr     - receives a MemNew()-allocated array of gis, or NULL
 *   database  - 0 to search nucleotides, anything else for proteins
 *
 * An accession is resolved with the Entrez query "<accession>[ACCN]".
 * Returns the number of gis stored in *giptr; 0 when nothing was found or
 * on error.
 */
Int4 BE_AccToGi(CharPtr accession, Uint4 **giptr, Int4 database)
{
    Int4 count;
    Int4 gi;
    CharPtr query;

    if(accession == NULL || giptr == NULL)
        return 0;

    *giptr = NULL;

    /* Checking if this is gi number */

    if((gi = atol(accession)) != 0) {
        if((*giptr = (Uint4 *)MemNew(sizeof(Uint4))) == NULL)
            return 0;
        (*giptr)[0] = gi;
        return 1;
    }

    /* Size the query from the input rather than using a fixed buffer;
       sizeof("[ACCN]") includes the terminating NUL. */
    query = (CharPtr)MemNew(StringLen(accession) + sizeof("[ACCN]"));
    if(query == NULL)
        return 0;
    sprintf(query, "%s[ACCN]", accession);

    count = BEGetUidsFromQuery(query, giptr,
                               (Boolean)(database == 0), /* Nucleotide ? */
                               FALSE);
    MemFree(query);

    return count > 0 ? count : 0;
}
771
772 #define UID_BUFF_SIZE 2048
773
BE_ReadIds(BEDataPtr pBdata,Uint4 ** ids_out)774 Int4 BE_ReadIds(BEDataPtr pBdata, Uint4 **ids_out)
775 {
776 Uint4 *uids, *giptr;
777 Int4 length, NumNotValid = 0, gi;
778 Int4 i, j, k, allocated, count = 0;
779 Char TmpBuff[16];
780 CharPtr buffer;
781
782 if((buffer = pBdata->uids) == NULL || *buffer == NULLB) {
783 *ids_out = NULL;
784 return 0;
785 }
786
787 length = StringLen(buffer);
788
789 allocated = UID_BUFF_SIZE;
790 uids = (Uint4 *)MemNew(allocated * sizeof(Uint4));
791
792 for(i = 0; i < length; i++) {
793
794 if(isspace(buffer[i]) || buffer[i] == ',') /* Rolling spaces */
795 continue;
796
797 /* This is defence from badly formatted requests */
798
799 if(NumNotValid > 10) {
800 printf("**** ERROR: Too many invalid Gis/Accessions, "
801 "parsing aborted\n");
802 *ids_out = NULL;
803 return 0;
804 }
805
806 /* Rolling spaces */
807
808 j= 0;
809 while (!isspace(buffer[i]) && j < 10 && i < length) {
810 TmpBuff[j] = buffer[i];
811 j++; i++;
812 if(buffer[i] == ',') /* Comma is valid delimiter */
813 break;
814 }
815 TmpBuff[j] = NULLB;
816
817
818 /* Ignore strings like ">Protein" */
819
820 if(j > 0 && TmpBuff[0] == '>' && IS_ALPHA(TmpBuff[1]))
821 continue;
822
823 /* Is gi/accession too long ??? */
824
825 if(j == 10) {
826 NumNotValid++;
827
828 while(!isspace(buffer[i]) ||
829 buffer[i] == ',' ||
830 buffer[i] == NULLB) /* Rolling until spaces */
831 i++;
832 continue; /* Next may be valid ... who knows...?? */
833 }
834
835 /* Now validating accession/gi */
836
837 for(k =0; k < j; k++) {
838 if(!isdigit(TmpBuff[k])) {
839 break;
840 }
841 }
842 if(k != j) {
843 if(!IS_ntdb_accession(TmpBuff) && !IS_protdb_accession(TmpBuff)) {
844 NumNotValid++;
845 printf("**** WARNING: Gi/Accession \"%s\" is not valid\n",
846 TmpBuff);
847 continue;
848 }
849 }
850
851 /* If this is valid Accession check and tranfer it to gi */
852
853 giptr = NULL;
854
855 if((gi = BEAccessionToGi (TmpBuff)) < 0) {
856 printf("**** WARNING: Gi/Accession %s is not found "
857 "in database----\r\n", TmpBuff);
858 /* NumNotValid++; */
859 continue;
860 } else {
861 if(count == allocated) {
862 allocated += UID_BUFF_SIZE;
863 uids = (Uint4*)Realloc(uids, allocated * sizeof(Uint4));
864 }
865 uids[count] = gi;
866 count++;
867 }
868 }
869
870 if(NumNotValid) {
871 printf("**** %d invalid Gi%s/Accession%s present in Entrez-batch "
872 "request\r\n",
873 (int)NumNotValid,
874 NumNotValid == 1 ? "" : "s",
875 NumNotValid == 1 ? "" : "s"
876 );
877 }
878
879 *ids_out = uids;
880 return count;
881 }
/* Header callback passed to head_tail_ff(); deliberately prints nothing. */
static void BatchHead(VoidPtr pointer, FILE *fd)
{
}
/* Trailer callback passed to head_tail_ff(); deliberately prints nothing. */
static void BatchTail(VoidPtr pointer, FILE *fd)
{
}
890
QSRV_Time(CharPtr string,Int4 len,time_t seconds)891 Boolean QSRV_Time(CharPtr string, Int4 len, time_t seconds)
892 {
893 if(string == NULL || len < 25)
894 return FALSE;
895
896 if(!seconds) {
897 seconds = GetSecs();
898 }
899
900 #if defined(OS_UNIX_IRIX) || defined(OS_UNIX_LINUX) || defined(OS_UNIX_OSF1) || defined(OS_UNIX_FREEBSD)
901 ctime_r(&seconds, string);
902 #elif defined(OS_UNIX_DARWIN) /* no ctime_r :-/ */
903 strncpy(string, ctime(&seconds), len - 1);
904 string[len - 1] = '\0';
905 #else
906 ctime_r(&seconds, string, len);
907 #endif
908
909 string[24] = NULLB;
910 return TRUE;
911 }
912
/*
 * Append one line describing the request to the usage log.
 *
 * The log is QSRV_LOGFILE_NAME in the current directory or, when that
 * cannot be opened, the file of the same name under /tmp.  When neither
 * can be opened, or pBdata is NULL, nothing is written.
 */
void QSRVWriteLogInfo(BEDataPtr pBdata)
{
    FILE *fd;
    Char timebuf[64];

    if(pBdata == NULL)
        return;

    if((fd = FileOpen(QSRV_LOGFILE_NAME, "a")) == NULL) {
        if((fd = FileOpen("/tmp/" QSRV_LOGFILE_NAME, "a")) == NULL)
            return;
    }

    /* Never print an uninitialized buffer if the time stamp fails. */
    if(!QSRV_Time(timebuf, sizeof(timebuf), 0))
        timebuf[0] = NULLB;

    fprintf(fd,
            "%s|db=%d|term=\"%s\"|format=%d|count=%d|%d|%d|%d|%d|%d|%d\n",
            timebuf, (int) pBdata->database,
            pBdata->query == NULL? "(null)" : pBdata->query,
            (int) pBdata->format, (int) pBdata->count,
            (int) pBdata->allset, (int) pBdata->request_type,
            (int) pBdata->html, (int) pBdata->savetodisk,
            (int) pBdata->noheader, (int) pBdata->commandline);

    fflush(fd);
    FileClose(fd);
}
943
Main(void)944 Int2 Main(void)
945 {
946 WWWInfoPtr info;
947 WWWErrorCode error;
948 BEDataPtr pBdata;
949 Int4 count;
950 Uint4 *ids;
951 Char tmp[512];
952
953 if((error = WWWReadPosting(&info)) != WWWErrOk) {
954 ErrPostEx(SEV_FATAL, 88, 0, "Error in processing WWW request");
955 return 1;
956 }
957
958 if(WWWGetMethod(info) == COMMAND_LINE) {
959 WWWInfoFree(info);
960
961 if((pBdata = BEMakeCLParam()) == NULL)
962 return 1;
963
964 } else {
965
966 if(!ErrSetLogfile (QSRV_LOGFILE_STD_NAME, ELOG_APPEND))
967 return 1;
968
969 if((pBdata = BEMakeWWWParam(info)) == NULL) {
970 printf("Content-type: text/html\n\n");
971 printf("QSRV_STATUS 802 Invalid input parameters <PRE><BR>\n");
972 fflush(stdout);
973
974 ErrPostEx(SEV_ERROR, 88, 0,
975 "Error in reading parameters. "
976 "Please check, that query string was set\n");
977 return 1;
978 }
979 }
980
981 if(!pBdata->commandline) {
982 if(pBdata->savetodisk)
983 printf("Content-type: chemical/seq-%s-genbank\n\n",
984 pBdata->database == 0 ? "na" : "aa");
985 else {
986 printf("Content-type: text/html\n\n");
987 printf("<HTML><HEADER><TITLE>Batch Entrez results"
988 "</TITLE></HEADER><PRE>\n");
989 }
990 }
991
992 if (! ID1BioseqFetchEnable("Nbatch", TRUE)) {
993 printf("Cannot initialize ID1\n");
994 return 1;
995 }
996
997 if(pBdata->html) {
998 init_www(); /* initializing WWW mode */
999 head_tail_ff(NULL, BatchHead, BatchTail);
1000
1001 }
1002
1003 SeqEntryLoad();
1004
1005 switch(pBdata->request_type) {
1006 case REQ_DEFAULT:
1007 if(pBdata->uids == NULL) {
1008 count = BEGetUidsFromQuery(pBdata->query, &ids,
1009 pBdata->database == 0, /* Nucleotide ? */
1010 FALSE);
1011 } else {
1012 count = BE_ReadIds(pBdata, &ids);
1013 }
1014 break;
1015 case REQ_ADVANCED_QUERY:
1016 case REQ_ORGANISM:
1017 count = BEGetUidsFromQuery(pBdata->query, &ids,
1018 pBdata->database == 0, /* Nucleotide ? */
1019 FALSE);
1020 break;
1021 case REQ_LIST_OF_GIS:
1022 count = BE_ReadIds(pBdata, &ids);
1023 break;
1024 }
1025 pBdata->count = count;
1026
1027 QSRVWriteLogInfo(pBdata);
1028
1029 if(count < 0) {
1030 if(pBdata->commandline)
1031 ErrPostEx(SEV_ERROR, 0, count, "Error in searching the database");
1032 else
1033 printf("QSRV_STATUS %d Error in searching database\n", count);
1034 return 1;
1035 }
1036
1037 if(count == 0) {
1038 if(pBdata->commandline)
1039 ErrPostEx(SEV_INFO, 0,0, "No entries found");
1040 else
1041 printf("QSRV_STATUS 901 OK No entries found\n", count);
1042 return 1;
1043 }
1044
1045 if((count > 20000 && pBdata->format == F_GEN) ||
1046 (count > 100000 && pBdata->format != F_GILIST)) {
1047
1048 if(pBdata->commandline)
1049 ErrPostEx(SEV_WARNING, 0,0, "Number of sequences %d exceed limit",
1050 count);
1051 else
1052 printf("QSRV_STATUS 803 Error Number of sequences %d "
1053 "exceed limit\n", count);
1054
1055 pBdata->format = F_GILIST;
1056 }
1057
1058 if(!pBdata->noheader && !pBdata->commandline)
1059 printf("QSRV_STATUS 900 OK: %d entr%s found <PRE><BR>\n",
1060 count, count == 1? "y" : "ies");
1061
1062 fflush(stdout);
1063
1064 /* Printing results */
1065
1066 BEPrintIds(pBdata, ids, count);
1067
1068 /* Clearing memory */
1069
1070 MemFree(ids);
1071 BEFreeCLParam(pBdata);
1072
1073 return 0;
1074 }
1075