1 /* $Id: batch.c,v 6.19 2000/08/30 16:44:20 vakatov Exp $
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government have not placed any restriction on its use or reproduction.
13 *
14 * Although all reasonable efforts have been taken to ensure the accuracy
15 * and reliability of the software and data, the NLM and the U.S.
16 * Government do not and cannot warrant the performance or results that
17 * may be obtained by using this software or data. The NLM and the U.S.
18 * Government disclaim all warranties, express or implied, including
19 * warranties of performance, merchantability or fitness for any particular
20 * purpose.
21 *
22 * Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name: $RCSfile: batch.c,v $
27 *
28 * Author: Sergei Shavirin
29 *
30 * Version Creation Date: 12/16/1996
31 *
32 * $Revision: 6.19 $
33 *
34 * File Description:
35 * Main file for WWW and Command Line BatchEntrez programs
36 *
37 * $Log: batch.c,v $
38 * Revision 6.19 2000/08/30 16:44:20 vakatov
39 * Fixed printf() format mismatch
40 *
41 * Revision 6.18 2000/05/09 13:37:01 shavirin
42 * Use functions IS_ntdb_accession() and IS_protdb_accession() to
43 * verify accession number.
44 *
45 * Revision 6.17 2000/02/03 21:00:38 beloslyu
46 * fix the NCBI_Date initialization
47 *
48 * Revision 6.16 1999/10/21 21:10:04 shavirin
49 * Added possibility to retrieve RefSeq accessions.
50 *
51 * Revision 6.15 1999/02/24 16:49:23 kans
52 * use accutils copy of IS_ntdb_accession and IS_protdb_accession
53 *
54 * Revision 6.14 1998/12/15 17:56:05 vakatov
55 * Fixed a tiny C++ compilation bug
56 *
57 * Revision 6.13 1998/07/07 13:43:41 shavirin
58 * Fixed warning of tough compiler setting.
59 *
60 * Revision 6.12 1998/05/19 21:54:05 shavirin
61 * Fixed function, that prints Batch Entrez WWW page
62 *
63 * Revision 6.11 1998/05/08 15:51:30 vakatov
64 * fixed UMR and a tiny typo; cleaned up some code
65 *
66 * Revision 6.10 1998/05/01 17:57:47 shavirin
67 * New revision
68 *
69 * Revision 6.9 1998/04/17 20:53:50 shavirin
70 * Check for accession format was made more "relaxed".
71 *
72 * Revision 6.8 1998/03/26 21:08:42 shavirin
73 * Changed exit(1) -> return 1 in Main() function.
74 *
75 * Revision 6.7 1997/12/10 18:00:24 shavirin
76 * Removed limits on number of gis to retrieve from command line mode
77 *
78 * Revision 6.5 1997/12/09 16:13:44 shavirin
79 * Removed message in ASN1_GENB case
80 *
81 * Revision 6.4 1997/12/01 20:09:56 shavirin
82 * Removed message in front of ASN1 outputs
83 *
84 * Revision 6.3 1997/11/26 21:57:13 shavirin
85 * Added format 5 - Single GenBank Bioseq-set
86 *
87 * Revision 6.2 1997/11/03 20:48:42 shavirin
88 * Added workaround for API bug with single gi retrieval
89 *
90 * Revision 6.1 1997/09/10 14:05:34 shavirin
91 * Added AE- type of accesssions handling
92 *
93 * Revision 6.0 1997/08/25 18:19:05 madden
94 * Revision changed to 6.0
95 *
96 * Revision 1.17 1997/07/23 19:24:32 shavirin
97 * Changed default background to white
98 *
99 * Revision 1.16 1997/07/22 18:57:30 shavirin
100 * Removed any limits for number of retrieved entries
101 * if program used from command line
102 *
103 * Revision 1.15 1997/07/21 15:03:18 shavirin
104 * Now strings like ">Protein" will be ignored
105 *
106 * Revision 1.14 1997/07/03 16:23:17 shavirin
107 * Added ability to retrieve few gis from single accession
108 *
109 * Revision 1.13 1997/06/27 18:32:59 shavirin
110 * Added AF- style nucleotide accessions to be accepted
111 *
112 * Revision 1.12 1997/05/14 19:14:31 shavirin
113 * Added #define LF 10
114 *
115 * Revision 1.11 1997/04/25 04:25:21 shavirin
116 * Few fixes due to usage of the program through proxy and small
117 * bug with reading from file
118 *
119 * Revision 1.10 1997/04/09 19:29:24 shavirin
120 * Included ability to retrieve Protein accessions
121 *
122 * Revision 1.9 1997/03/28 18:23:13 shavirin
123 * Use PubMed accession index instead of SeqId index. Removed "www,www3"
124 * references for better proxying.
125 *
126 * Revision 1.8 1997/03/14 15:38:38 shavirin
127 * Removed difference between capital and small characters for
128 * accesssion number checkup.
129 *
130 * Revision 1.7 1997/03/13 16:15:52 shavirin
131 * Added new option for WWW Batch Entrez to retrieve single entry
132 * or complete set.
133 *
134 * Revision 1.6 1997/03/12 22:47:41 shavirin
135 * Added option to return only one entry from one entry
136 *
137 * Revision 1.5 1997/03/04 17:19:22 shavirin
138 * Fixed parser for long invalid accessions and added comma as
139 * valid delimiter of accesssions/gis
140 *
141 * Revision 1.4 1997/01/23 19:02:37 shavirin
142 * Removed creation of spurious logfiles in command-line mode
143 *
144 * Revision 1.3 1996/12/17 17:27:18 shavirin
145 * Function WWWSendBatchPage() changed to static
146 *
147 * Revision 1.2 1996/12/16 19:55:35 shavirin
148 * Changed file description.
149 *
150 * Revision 1.1 1996/12/16 19:51:37 shavirin
151 * Initial revision
152 *
153 *
154 * ==========================================================================
155 */
156
157 #define LogFile "wwwbatch.log"
158
159 #include <ncbi.h>
160 #include <ffprint.h>
161 #include <accentr.h>
162 #include <accutils.h>
163 #include <tofasta.h>
164 #include <asn2ff.h>
165 #include <ncbiwww.h>
166
167 #define LF 10
168
/* Singly linked list node: one resolved gi together with the ID text it came from. */
169 typedef struct BatchAccList {
170 CharPtr acc; /* saved copy of the gi/accession token parsed from the request */
171 Int4 gi; /* gi stored for this token (one node per gi) */
172 struct BatchAccList *next; /* next node; NULL at the tail */
173 } BatchAccList, PNTR BatchAccListPtr;
174
/* Request parameters, filled either from the WWW form or from the command line. */
175 typedef struct BatchParam {
176 Int4 format; /* output format, one of the F_* codes */
177 Boolean dump; /* TRUE: WWW reply is sent as application/octet-stream */
178 Int4 single_entry; /* passed to EntrezSeqEntryGet()/PrintGi(); non-zero selects only the requested entry */
179 Int4 request; /* REQ_ORG or REQ_LIST */
180 Int4 sequence; /* BSEQ_NA or BSEQ_AA */
181 CharPtr organism; /* organism name used for REQ_ORG lookups */
182 CharPtr file; /* text containing the list of gis/accessions */
183 Boolean html; /* TRUE: HTML-decorated output */
184 Boolean id_lookup; /* set when a single ID was given with the -u argument */
185 Boolean CommandLine; /* TRUE when started from the command line rather than via WWW */
186 } BatchParam, PNTR BatchParamPtr;
187
/* State of the single Bioseq-set ASN.1 stream used by format F_ASN1_GENB. */
188 typedef struct BGenBank {
189 AsnIoPtr aip; /* ASN.1 text output stream (opened on stdout by BGenBankInit) */
190 AsnTypePtr atp; /* type "Bioseq-set.seq-set.E", used when writing each SeqEntry */
191 AsnTypePtr atp_bioseq_set_seq_set; /* type "Bioseq-set.seq-set", closed in BGenBankClose */
192 AsnTypePtr atp_bioseq_set; /* type "Bioseq-set", closed in BGenBankClose */
193 } BGenBank, PNTR BGenBankPtr;
194
/* Values of BatchParam.request */
195 #define REQ_ORG 0 /* retrieve all sequences of one organism */
196 #define REQ_LIST 1 /* retrieve sequences named in a list of gis/accessions */
197
/* Values of BatchParam.format (same numbering as the -f command line argument) */
198 #define F_GEN 0 /* GenBank/GenPept flat file */
199 #define F_FASTA 1 /* FASTA */
200 #define F_ASN1 2 /* ASN.1, one SeqEntry per record */
201 #define F_GILIST 3 /* list of gis only */
202 #define F_DLIST 4 /* list of deflines */
203 #define F_ASN1_GENB 5 /* all records wrapped in a single Bioseq-set ASN.1 */
204
/* Values of BatchParam.sequence; aliases for the TYP_NT / TYP_AA constants */
205 #define BSEQ_NA TYP_NT /* nucleotide */
206 #define BSEQ_AA TYP_AA /* protein */
207
/* Maximum number of records served to a WWW request (not applied in command line mode) */
208 #define SEARCH_LIMIT 20000 /* every format except F_DLIST */
209 #define SEARCH_DLIMIT 70000 /* F_DLIST (deflines) */
210
211 static void WWWSendBatchPage(Int4 which);
212
213 static Int4 AccessionToGi(CharPtr string, Int4Ptr PNTR giptr, Int2 seqtype);
214 static BatchAccListPtr GetAccList(BatchParamPtr batchP, Int4Ptr total);
215
216 static Boolean PrintGi(Int4 gi, Int4 format,
217 FILE *fd, Int4 seq_type,
218 Int4 single_entry, BGenBankPtr bgbp);
219
220 static BatchParamPtr MakeBatchParameters(WWWEntryPtr PNTR entries, Int4 m);
221 static ByteStorePtr GetGisFromOrg(CharPtr org,
222 Int4Ptr GiNum, Int4 seqtype);
223
224 extern Boolean SeqEntrysToDefline(SeqEntryPtr sep,
225 FILE *fp, Boolean is_na, Uint1 group_segs);
226 static BatchParamPtr MakeCommandLineParameters(void);
227
228 void BatchHead(VoidPtr pointer, FILE *fd);
229 void BatchTail(VoidPtr pointer, FILE *fd);
230
/*
 * NULL-terminated list of organism names offered in the LIST_ORG <select>
 * of the WWW form.  The first entry, "(None)", is the "nothing chosen"
 * value that MakeBatchParameters() ignores.
 */
231 static CharPtr organism[] = { "(None)",
232 "Arabidopsis thaliana",
233 "Bacillus subtilis",
234 "Bos taurus",
235 "Caenorhabditis elegans",
236 "Dictyostelium discoideum",
237 "Drosophila melanogaster",
238 "Escherichia coli",
239 "Gallus gallus",
240 "Homo sapiens",
241 "Human immunodeficiency virus type 1",
242 "Mus musculus",
243 "Oryctolagus cuniculus",
244 "Oryza sativa",
245 "Ovis aries",
246 "Rattus norvegicus",
247 "Saccharomyces cerevisiae",
248 "Schizosaccharomyces pombe",
249 "Simian immunodeficiency virus",
250 "Xenopus laevis",
251 "Zea mays",
252 NULL
253 };
254
255
/*
 * MACRO_atp_find(atp, name) - look up the ASN.1 type <name> and store it
 * in `atp`.
 *
 * Requirements on the call site:
 *   - a variable `amp` (the module list given to AsnTypeFind) is in scope;
 *   - the enclosing function returns a pointer: when the type is not found
 *     an error is posted and the *enclosing function* returns NULL.
 *
 * `name` is stringified, so it is written without quotes
 * (e.g. MACRO_atp_find(atp, Bioseq-set.level);).
 *
 * The body is wrapped in do { } while (0) so that the expansion is a single
 * statement and cannot capture an `else` that follows the call.
 */
#define MACRO_atp_find(atp,name)\
    do {\
        if(((atp) = AsnTypeFind(amp, #name))==NULL){\
            ErrPostEx(SEV_ERROR,0,0,\
                      "Could not find type <%s>", #name);\
            return NULL;\
        }\
    } while (0)
262
263
264 #define BatchTitle "<A HREF=\"/htbin-post/PubMed/imagemap/EntrezBatch/batch.map\"><IMG SRC=\"/EntrezBatch/batch.gif\" BORDER=0 ISMAP HEIGHT=22 WIDTH=500></A>"
265
266 #define BatchResultsTitle "<A HREF=\"/htbin-post/PubMed/imagemap/EntrezBatch/batch.map\"><IMG SRC=\"/EntrezBatch/batch_results.gif\" BORDER=0 ISMAP HEIGHT=22 WIDTH=500></A>"
267 /************************************************************************
268 *
269 * void WWWSendBatchPage(Int4 which) - function to draw the entry form for
270 * the WWW Batch Entrez program
271 *
272 ***********************************************************************/
273
/* Return the value of environment variable `name`, or "NOT_SET" if unset. */
static const char *BatchEnvOrNotSet(const char *name)
{
    const char *value = getenv(name);

    return value != NULL ? value : "NOT_SET";
}

/*
 * Print the complete HTTP response (status line, headers and HTML) holding
 * the Batch Entrez query form to stdout.
 *
 * which - 0: the list of gis/accessions is typed into a <textarea>;
 *         non-zero: the list is uploaded through an <INPUT TYPE="file">
 *         field and the form is sent as multipart/form-data.
 *
 * The form posts back to http://SERVER_NAME:SERVER_PORT SCRIPT_NAME/result,
 * all three taken from the environment ("NOT_SET" when missing).
 */
static void WWWSendBatchPage(Int4 which)
{
    Int4 i;

    printf("HTTP/1.0 200 OK\r\n");
    printf("MIME-Version: 1.0\r\n");

    printf("Content-type: text/html\r\n\r\n");
    printf("<HTML>\n<HEAD>\n");
    printf("<TITLE>Batch Entrez</TITLE>\n</HEAD>\n");

    printf("<BODY bgcolor=\"#FFFFFF\" text=\"#000000\" "
           "link=\"#0000f0\" vlink=\"#6000b0\" alink=\"#f00000\">\n");

    printf("%s%c", BatchTitle, LF);

    printf("<FORM ACTION=\"http://%s:%s%s/result\" METHOD=POST "
           "NAME=\"BATCH\" %s >%c",
           BatchEnvOrNotSet("SERVER_NAME"),
           BatchEnvOrNotSet("SERVER_PORT"),
           BatchEnvOrNotSet("SCRIPT_NAME"),
           which ? "ENCTYPE=\"multipart/form-data\" " : "", LF);

    printf("<BR> This page is designed "
           "to allow you to download (receive) a large "
           "number of sequences from Entrez, in a batch mode. "
           "The results of the search will be saved to a local "
           "disk file on your machine. Upon submitting your query, "
           "you will be prompted to provide the filename "
           "where the results will be stored. Please make sure that "
           "you have enough disk space on your computer before "
           "submitting this request.\n%c", LF);

    printf("<BR><BR><B>Choose type of sequences "
           "to search and format of output: </B><BR><BR>");

    /* ---- sequence type and segmented-set option ---- */

    printf("Sequence type: %c",LF);

    printf("<select name = SEQ_TYPE>");
    printf("<option> Nucleotide ");
    printf("<option> Protein ");
    printf("</select>");

    printf(" "
           "<INPUT TYPE=\"checkbox\" NAME=\"RETURN_ALL_SET\" "
           "VALUE=YES> "
           "Include all records within a segmented set");

    printf("<BR>");
#ifdef NOT_SAVE
    printf("<INPUT TYPE=\"checkbox\" NAME=\"DUMP_TYPE\" "
           "VALUE=FILE></B> Save results to file");
#else
    /* Results are always saved to a file: the field is sent hidden. */
    printf("<INPUT TYPE=\"hidden\" NAME=\"DUMP_TYPE\" "
           "VALUE=FILE >");
#endif

    /* ---- output format ---- */

    printf("</B>Format: "
           " %c",LF);

    printf("<select name = FORMAT>");
    printf("<option> GenBank/GenPept");
    printf("<option> FASTA");
    printf("<option> ASN.1");
    printf("<option> List of GIs");
    printf("<option> List of Deflines");
    printf("</select>");

    printf("<INPUT TYPE=\"checkbox\" NAME=\"HTML\" "
           "VALUE=HTML CHECKED> HTML");

    printf("<HR>");

    /* ---- request type 1: by organism ---- */

    printf("<INPUT TYPE=\"radio\" NAME=\"REQUEST_TYPE\" "
           "VALUE=ORGANISM CHECKED> <B>Retrieve all "
           "sequences for a specific organism.</B>");

    printf("<BR> "
           "Enter organism name here"
           /* "(use full scientific name or common name)" */
           "</B> "
           "<INPUT TYPE=\"text\" NAME=\"ORGNAME\" "
           "VALUE=\"\" MAXLENGTH=\"50\">");

    printf("<BR> "
           "Or choose it from list: <select name=LIST_ORG>");
    for (i = 0; organism[i] != NULL; i++)
        printf("<option> %s ", organism[i]);
    printf("</select>");
    printf("<BR> <A HREF="
           "\"/Taxonomy/tax.html\">"
           "Explore the taxonomy database at NCBI</A>");

    printf("<HR>");

    /* ---- request type 2: by list or file of gis/accessions ---- */

    printf("<INPUT TYPE=\"radio\" NAME=\"REQUEST_TYPE\" "
           "VALUE=FILESUBMIT > <B>Retrieve all sequences from a "
           "%s of Gis/Accessions</b>",
           which ? "file" : "list");

    if (!which) {
        printf("<BR>Enter gis/accessions here "
               "(delimited by spaces or newlines)<BR> ");
        printf("<textarea name=\"USERFILE\" rows=6 cols=60>"
               "</textarea>%c", LF);
    } else {
        printf("<BR> ");
        /* The tag must be closed with '>': otherwise the <HR> and the
           submit button that follow are parsed as part of this INPUT. */
        printf("Enter filename here "
               "<INPUT TYPE=\"file\" NAME=\"USERFILE\" "
               "onFocus=\"window.status='Press radio button to "
               "activate this search type'; return true;\">");
    }

    printf("<HR>");
    printf("<BR><INPUT TYPE=\"submit\">%c", LF);
    printf("<INPUT TYPE=\"reset\" VALUE=\"Clear input\">%c", LF);
    printf("</FORM>%c", LF);
    printf("<HR>%c", LF);

    /* ---- footer; literal angle brackets are written as entities ---- */

    printf("<ADDRESS>");
    printf("Comments and suggestions to:"
           "&lt; <a href=\"mailto:info@ncbi.nlm.nih.gov\">"
           "info@ncbi.nlm.nih.gov"
           "</a> &gt; <BR> Credits to: "
           "<a href=\"mailto:shavirin@ncbi.nlm.nih.gov\">"
           "Sergei B. Shavirin</a>\n"
           "<!-- <a href=\"http://www.ncbi.nlm.nih.gov/STS/shavirin.html\">"
           "Sergei B. Shavirin</a> -->"
           "<BR>Acknowledgements to: "
           "<a href=\"mailto:epstein@ncbi.nlm.nih.gov\">"
           "Jonathan Epstein</a>");
    printf("</ADDRESS>%c", LF);

} /* WWWSendBatchPage() */
407
408 /************************************************************************
409 *
410 * Int2 Main() - main function for the WWW BatchEntrez search program
411 *
412 ***********************************************************************/
413 #define NUMARGS 8
414
/*
 * Command line argument table handed to GetArgs().  The code refers to the
 * entries by index, so the order matters:
 *   [0] -s sequence type      [1] -f output format
 *   [2] -n single entry flag  [3] -h text/HTML
 *   [4] -i input file         [5] -o organism name
 *   [6] -u single ID          [7] -l log file name
 */
415 Args dump_args[NUMARGS] = {
416 {"Sequence type\n"
417 " 0 - Nucleotide \n"
418 " 1 - Protein",
419 NULL, NULL,NULL,FALSE,'s',ARG_INT,0.0,0,NULL},
420 {"Format of output\n"
421 " 0 - GenBank/GenPept \n"
422 " 1 - FASTA \n"
423 " 2 - ASN.1 \n"
424 " 3 - List of GIs\n"
425 " 4 - List of Deflines\n"
426 " 5 - Single GenBank Bioseq-set ASN.1",
427 "0", NULL,NULL,FALSE,'f',ARG_INT, 0.0,0,NULL},
428 {"Number of returned entries from single gi/accession\n"
429 " 0 - Return all entries in SeqEntry, that available \n"
430 " 1 - Return single entry specified by gi/accession",
431 "0", NULL,NULL,FALSE,'n',ARG_INT,0.0,0,NULL},
432 {"Text or HTML? (For GenBank/GenPept)\n"
433 " 0 - Text output \n"
434 " 1 - HTML output",
435 "0", NULL,NULL,FALSE,'h',ARG_INT, 0.0,0,NULL},
436 { "File with list of GIS/Accessions",
437 "stdin", NULL, NULL, TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
438 {"Organism name (for retrieve by organism)",
439 NULL, NULL,NULL,TRUE,'o',ARG_STRING, 0.0,0,NULL},
440 {"ID (accession or gi)",
441 NULL, NULL,NULL,TRUE,'u',ARG_STRING, 0.0,0,NULL},
442 {"Logfile name:",
443 "nbatch.log", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL}
444 };
445
MakeCommandLineParameters(void)446 static BatchParamPtr MakeCommandLineParameters(void)
447 {
448 BatchParamPtr batchP;
449 FILE *fd;
450
451 if ( !GetArgs ("Nbatch",NUMARGS,dump_args) ) {
452 return NULL;
453 }
454
455 if( !ErrSetLogfile (dump_args[7].strvalue, ELOG_APPEND) ) {
456 exit(1);
457 }
458 ErrSetLogLevel(SEV_MAX);
459
460 batchP = (BatchParamPtr) MemNew(sizeof(BatchParam));
461
462 batchP->dump = TRUE;
463
464 if(dump_args[0].intvalue)
465 batchP->sequence = BSEQ_AA;
466 else
467 batchP->sequence = BSEQ_NA;
468
469 batchP->format = dump_args[1].intvalue;
470 batchP->single_entry = dump_args[2].intvalue;
471 batchP->html = (Uchar)dump_args[3].intvalue;
472 batchP->request = REQ_LIST;
473
474 if(dump_args[6].strvalue != NULL) {
475
476 batchP->file = StringSave(dump_args[6].strvalue);
477 batchP->id_lookup = TRUE;
478
479 } else if((batchP->organism =
480 StringSaveNoNull(dump_args[5].strvalue)) != NULL) {
481 batchP->file = StringSave("");
482 batchP->request = REQ_ORG;
483 } else if(dump_args[4].strvalue != NULL) {
484 if((fd = FileOpen(dump_args[4].strvalue, "r")) == NULL) {
485 ErrLogPrintf("Input file do not exists or empty\n");
486 return NULL;
487 }
488 if((batchP->file = WWWReadFileInMemory(fd, 0, TRUE)) == NULL)
489 return NULL;
490 FileClose(fd);
491
492 batchP->organism = StringSave("");
493
494 } else { /* Error no valid input found This MUST not happen */
495 return NULL;
496 }
497
498 return batchP;
499 }
BGenBankInit(void)500 BGenBankPtr BGenBankInit(void)
501 {
502 BGenBankPtr bgbp;
503 AsnModulePtr amp;
504
505 AsnTypePtr atp_bioseq_set;
506 AsnTypePtr atp_bioseq_set_level;
507 AsnTypePtr atp_bioseq_set_class;
508 AsnTypePtr atp_bioseq_set_release;
509 AsnTypePtr atp_bioseq_set_date;
510 AsnTypePtr atp_bioseq_set_seq_set;
511 AsnTypePtr atp_bioseq_set_seq_set_E;
512
513 Char release[] = "Nbatch Dump";
514 Char date_time[128];
515 NCBI_Date date={{0,0,0,0,255,255,255},NULL};
516 DataVal dv;
517
518 bgbp = (BGenBank*)MemNew(sizeof(BGenBank));
519 bgbp->aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
520
521 amp = AsnAllModPtr();
522
523 MACRO_atp_find(atp_bioseq_set,Bioseq-set);
524 MACRO_atp_find(atp_bioseq_set_level,Bioseq-set.level);
525 MACRO_atp_find(atp_bioseq_set_class,Bioseq-set.class);
526 MACRO_atp_find(atp_bioseq_set_release,Bioseq-set.release);
527 MACRO_atp_find(atp_bioseq_set_date,Bioseq-set.date);
528 MACRO_atp_find(atp_bioseq_set_seq_set,Bioseq-set.seq-set);
529 MACRO_atp_find(atp_bioseq_set_seq_set_E,Bioseq-set.seq-set.E);
530
531 if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set,NULL))
532 return NULL;
533 dv.intvalue = 0;
534
535 if(!AsnWrite(bgbp->aip,atp_bioseq_set_level,&dv))
536 return NULL;
537 dv.intvalue = 7;
538
539 if(!AsnWrite(bgbp->aip,atp_bioseq_set_class,&dv))
540 return NULL;
541 dv.ptrvalue = release;
542
543 if(!AsnWrite(bgbp->aip,atp_bioseq_set_release,&dv))
544 return NULL;
545
546 Nlm_DayTimeStr(date_time,TRUE,TRUE);
547 date.str=date_time;
548
549 if(!DateAsnWrite(&date,bgbp->aip,atp_bioseq_set_date))
550 return NULL;
551
552 if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set_seq_set,NULL))
553 return NULL;
554 AsnIoFlush(bgbp->aip);
555
556 bgbp->atp =atp_bioseq_set_seq_set_E;
557 bgbp->atp_bioseq_set_seq_set = atp_bioseq_set_seq_set;
558 bgbp->atp_bioseq_set = atp_bioseq_set;
559
560 return bgbp;
561 }
/*
 * Finish the Bioseq-set started by BGenBankInit(): close the "seq-set"
 * sequence and the enclosing Bioseq-set, close the ASN.1 stream and free
 * the state.  A NULL argument (e.g. a failed BGenBankInit) is ignored.
 */
void BGenBankClose(BGenBankPtr bgbp)
{
    if (bgbp == NULL)
        return;

    if (bgbp->aip != NULL) {
        /* Close in the reverse order of opening: inner seq-set first. */
        AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set_seq_set, NULL);
        AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set, NULL);
        AsnIoClose(bgbp->aip);
    }
    MemFree(bgbp);
}
571
Main()572 Int2 Main ()
573 {
574 Int4 i, gi;
575 ByteStorePtr gis_bsp;
576 Int4 NumGis;
577 BatchParamPtr batchP = NULL;
578 BatchAccListPtr AccList, AccTmp;
579 Int4 TotalNumAcc =0;
580 time_t time_now;
581 CharPtr TimeNowStr;
582 WWWInfoPtr info;
583 WWWErrorCode error;
584 FILE *log_file = NULL;
585 BGenBankPtr bgbp = NULL;
586
587 if((error = WWWReadPosting(&info)) != WWWErrOk) {
588 ErrLogPrintf("Error in processing WWW request\n");
589 return 1;
590 }
591
592 if(WWWGetMethod(info) != COMMAND_LINE) {
593 if( !ErrSetLogfile (LogFile, ELOG_APPEND) ) {
594 return 1;
595 }
596 ErrSetLogLevel(SEV_MAX);
597 log_file = FileOpen(LogFile, "a");
598 }
599
600 if(WWWGetMethod(info) == COMMAND_LINE) {
601 if((batchP = MakeCommandLineParameters()) == NULL) {
602 return 1;
603 }
604 batchP->CommandLine = TRUE;
605 } else if (WWWGetMethod(info) == WWW_GET) {
606 time_now = GetSecs();
607 TimeNowStr = ctime(&time_now);
608 TimeNowStr[24] = '\0';
609 fprintf(log_file, "\n%s|%s|%s|%s|%d",
610 TimeNowStr, WWWGetAddress(info),
611 WWWGetHost(info), WWWGetAgent(info), 0);
612
613 if(WWWGetBrowser(info) != NETSCAPE)
614 WWWSendBatchPage(0);
615 else
616 WWWSendBatchPage(1);
617
618 FileClose(log_file);
619 return 1;
620 } else { /* method == POST */
621
622 if((batchP = MakeBatchParameters(WWWGetWWWEntries(info),
623 WWWGetNumEntries(info))) == NULL) {
624 printf("Content-type: text/html\r\n\r\n");
625 printf("Error in creating BATCH parameters");
626 FileClose(log_file);
627 return 1;
628 }
629 }
630
631 time_now = GetSecs();
632 TimeNowStr = ctime(&time_now);
633 TimeNowStr[24] = '\0';
634 if(!batchP->CommandLine) {
635 fprintf(log_file, "\n%s|%s|%s|%s|%d|%d|%d|%d|%d|%d|%s|%d|",
636 TimeNowStr, WWWGetAddress(info),
637 WWWGetHost(info), WWWGetAgent(info), (int)1,
638 (int)batchP->format, (int)batchP->html, (int)batchP->dump,
639 (int)batchP->request, (int)batchP->sequence,
640 (batchP->organism[0] == NULLB) ? "(null)" : batchP->organism,
641 (int)(batchP->file[0] == NULLB ? 0 : 1));
642 }
643 FileClose(log_file);
644
645 if((batchP->request == REQ_ORG) && (StringLen(batchP->organism) < 3)) {
646 if(!batchP->CommandLine) {
647 printf("<TITLE>Batch Entrez Results</TITLE>");
648 printf("%s%c<BR><BR><BR><b>", BatchResultsTitle, LF);
649 }
650 printf("ERROR: Length of organism name must be more "
651 "than 2 characters\n");
652 if(!batchP->CommandLine)
653 printf("</b>");
654 return 1;
655 }
656 if((batchP->file[0] == NULLB) && (batchP->request != REQ_ORG)) {
657 if(!batchP->CommandLine) {
658 printf("Content-type: text/html\r\n\r\n");
659 printf("<TITLE>Batch Entrez Results</TITLE>");
660 printf("%s%c<BR><BR><BR><b>", BatchResultsTitle, LF);
661 }
662 printf("ERROR: You did not entered filename "
663 "with gis/accessions or file may be empty.\n");
664 if(!batchP->CommandLine)
665 printf("</b>");
666 return 1;
667 }
668
669 if(!batchP->CommandLine) {
670
671 printf("HTTP/1.0 200 OK\r\n");
672 printf("MIME-Version: 1.0\r\n");
673
674 if(batchP->dump) {
675 printf("Content-type: application/octet-stream\r\n\r\n");
676 } else {
677 printf("Content-type: text/html\r\n\r\n");
678 printf("<TITLE>Batch Entrez Results</TITLE>");
679 printf("%s%c", BatchResultsTitle, LF);
680 }
681 }
682
683 #ifdef DEBUG_MODE
684 printf("Content-type: text/html\r\n\r\n");
685 for(i=0; i < m; i++) {
686 printf("%s : %s\n <BR>%c", entries[i]->name, entries[i]->val, LF);
687 }
688 exit(1);
689 #endif
690
691 putenv("USER=BatchEntrez");
692
693 if(batchP->html) {
694 init_www(); /* initializing WWW mode */
695 head_tail_ff(NULL, BatchHead, BatchTail);
696 }
697
698 if(!batchP->dump || batchP->html)
699 printf("<PRE>"); /* Entering text mode */
700
701 /* Starting up connection to Entrez */
702
703 if (! EntrezInit("BatchEntrez", FALSE, NULL)) {
704 printf("Cannot initialize Entrez\r\n");
705 return 1;
706 }
707
708 EntrezBioseqFetchEnable("BatchEntrez", TRUE);
709
710 /* Finding all gi's of given organism */
711
712 if(batchP->request == REQ_ORG) {
713 if((gis_bsp = GetGisFromOrg(batchP->organism,
714 &NumGis, batchP->sequence)) == NULL) {
715 printf("**** ERROR: Lookup for GI's of organism \"%s\" failed\r\n",
716 batchP->organism);
717 return 1;
718 } else {
719 if(!batchP->dump || (batchP->format != F_FASTA))
720 printf("**** %d gi's found for organism \"%s\"\r\n\r\n",
721 (int)NumGis, batchP->organism);
722 /* setting limit of printing */
723 if(WWWGetMethod(info) != COMMAND_LINE &&
724 NumGis > (batchP->format == F_DLIST ? SEARCH_DLIMIT : SEARCH_LIMIT)) {
725 printf("**** The number of retrieved GIs (%d) "
726 "exceeds the current limit of %d.\r\n"
727 "**** Only the list of GIs will be downloaded "
728 "to your computer. Please divide\r\n"
729 "**** this list into smaller parts and submit "
730 "them separately.\r\n\r\n",
731 (int)NumGis, (int)(batchP->format == F_DLIST ?
732 SEARCH_DLIMIT : SEARCH_LIMIT));
733 batchP->format = F_GILIST;
734 }
735 }
736 if(batchP->format != F_GILIST && batchP->html)
737 printf("<HR>");
738
739 if(batchP->format == F_ASN1_GENB)
740 bgbp = BGenBankInit();
741
742 for(i= 0; i < NumGis; i++) {
743 BSRead(gis_bsp, &gi, sizeof(Int4));
744 if(!PrintGi(gi, batchP->format, stdout,
745 batchP->sequence, batchP->single_entry, bgbp)) {
746 printf("Error in printing gi %d\r\n", (int) gi);
747 }
748 if(batchP->html)
749 printf("<HR>");
750 }
751
752 if(batchP->format == F_ASN1_GENB)
753 BGenBankClose(bgbp);
754
755 } else {
756
757 /* Printing all sequences from list of accessions in user's file */
758
759 if((AccList = GetAccList(batchP, &TotalNumAcc)) == NULL) {
760 printf("**** ERROR: No valid Gis/Accessions found\r\n");
761 return 1;
762 }
763
764 if(batchP->format != F_ASN1_GENB &&
765 batchP->format != F_FASTA &&
766 batchP->format != F_ASN1) {
767 if(!batchP->dump && !batchP->id_lookup)
768 printf("**** %d validly formatted Gis/Accessions present in "
769 "Entrez-batch request\r\n\r\n", (int)TotalNumAcc);
770 }
771
772 if(WWWGetMethod(info) != COMMAND_LINE) {
773 if(TotalNumAcc > (batchP->format == F_DLIST ?
774 SEARCH_DLIMIT : SEARCH_LIMIT)) {
775 printf("**** The number of retrieved Gis/Accessions (%d) "
776 "exceeds the current limit of %d.\r\n"
777 "**** Please divide this list into smaller parts and submit "
778 "them separately.\r\n\r\n",
779 (int)TotalNumAcc, (int)(batchP->format == F_DLIST ?
780 SEARCH_DLIMIT : SEARCH_LIMIT));
781 return 1;
782 }
783 }
784 if(batchP->html)
785 printf("<HR>");
786
787 if(batchP->format == F_ASN1_GENB)
788 bgbp = BGenBankInit();
789
790 for (AccTmp = AccList; AccTmp != NULL; AccTmp = AccTmp->next) {
791 if(!PrintGi(AccTmp->gi, batchP->format,
792 stdout, batchP->sequence, batchP->single_entry, bgbp)) {
793 printf("**** WARNING: Printing Gi %d (%s) failed \r\n\r\n",
794 (int)AccTmp->gi, AccTmp->acc);
795 }
796
797 if(batchP->html)
798 printf("<HR>");
799 }
800 if(batchP->format == F_ASN1_GENB)
801 BGenBankClose(bgbp);
802 }
803
804 /* terminating Entrez connection */
805
806 if(!batchP->CommandLine)
807 printf("\r\n\r\n**** Transfer completed successfuly -------=-*\r\n\r\n");
808
809 EntrezFini();
810 EntrezBioseqFetchDisable();
811 fflush(stdout);
812
813 Nlm_FreeArgs(NUMARGS, dump_args);
814
815 return 0;
816 }
817
818
MakeBatchParameters(WWWEntryPtr PNTR entries,Int4 m)819 static BatchParamPtr MakeBatchParameters(WWWEntryPtr PNTR entries, Int4 m)
820 {
821 Int4 i;
822 BatchParamPtr batchP;
823 Boolean OrganismSet = FALSE;
824
825 batchP = (BatchParamPtr) MemNew(sizeof(BatchParam));
826
827 batchP->format = F_GEN;
828 batchP->dump = FALSE;
829 batchP->request = REQ_ORG;
830 batchP->sequence = BSEQ_NA;
831 batchP->organism = NULLB;
832 batchP->file = NULLB;
833 batchP->html = FALSE;
834 batchP->single_entry = TRUE;
835
836 for(i=0; i < m; i++) {
837
838 /* FORMAT */
839
840 if (!StringICmp(entries[i]->name, "FORMAT")) {
841 if (!StringICmp(entries[i]->val, "GenBank/GenPept"))
842 batchP->format = F_GEN;
843 else if (!StringICmp(entries[i]->val, "FASTA"))
844 batchP->format = F_FASTA;
845 else if (!StringICmp(entries[i]->val, "ASN.1"))
846 batchP->format = F_ASN1;
847 else if (!StringICmp(entries[i]->val, "List of GIs"))
848 batchP->format = F_GILIST;
849 else if (!StringICmp(entries[i]->val, "List of Deflines"))
850 batchP->format = F_DLIST;
851
852 /* DUMP TYPE */
853
854 } else if (!StringICmp(entries[i]->name, "DUMP_TYPE")) {
855 batchP->dump = TRUE;
856 } else if (!StringICmp(entries[i]->name, "RETURN_ALL_SET")) {
857 batchP->single_entry = FALSE;
858
859 /* REQUEST TYPE */
860
861 } else if (!StringICmp(entries[i]->name, "REQUEST_TYPE")) {
862 if (!StringICmp(entries[i]->val, "ORGANISM"))
863 batchP->request = REQ_ORG;
864 else if (!StringICmp(entries[i]->val, "FILESUBMIT"))
865 batchP->request = REQ_LIST;
866
867 /* SEQUENCE TYPE */
868
869 } else if (!StringICmp(entries[i]->name, "SEQ_TYPE")) {
870 if (!StringICmp(entries[i]->val, "nucleotide"))
871 batchP->sequence = BSEQ_NA;
872 else if (!StringICmp(entries[i]->val, "protein"))
873 batchP->sequence = BSEQ_AA;
874
875 /* ORGANISM NAME */
876
877 } else if (!StringICmp(entries[i]->name, "ORGNAME")) {
878 batchP->organism = entries[i]->val;
879 if(entries[i]->val[0] != NULLB)
880 OrganismSet = TRUE;
881 } else if (!StringICmp(entries[i]->name, "LIST_ORG") && !OrganismSet) {
882 if (StringICmp(entries[i]->val, "(None)"))
883 batchP->organism = entries[i]->val;
884
885 /* USER FILE */
886
887 } else if (!StringICmp(entries[i]->name, "USERFILE")) {
888 batchP->file = entries[i]->val;
889 } else if (!StringICmp(entries[i]->name, "HTML")) {
890 batchP->html = TRUE;
891 }
892 } /* for (i=0... */
893
894 return batchP;
895
896 }
897
AccessionToGi(CharPtr string,Int4Ptr PNTR giptr,Int2 seqtype)898 static Int4 AccessionToGi(CharPtr string, Int4Ptr PNTR giptr, Int2 seqtype)
899 {
900 Int4 i, gi = 0;
901 CharPtr str;
902 ByteStorePtr bsp;
903 Int4 GiNum;
904
905 if(string == NULL)
906 return 0;
907
908 if((gi = atol(string)) > 0) { /* He-he this is gi... */
909 *giptr = (Int4Ptr) MemNew(sizeof(Int4));
910 (*giptr)[0] = gi;
911 return 1;
912 }
913 str = (CharPtr) MemNew(StringLen(string)+30);
914 sprintf(str, "\"%s\"[ACCN]", string);
915
916 if((bsp = EntrezTLEvalXString(str, seqtype,
917 -1, NULL, NULL)) == NULL) {
918 ErrLogPrintf("Failure to parse input string");
919 return 0;
920 }
921 MemFree(str);
922
923 BSSeek(bsp, 0L, 0);
924
925 if((GiNum = BSLen(bsp)/sizeof(DocUid)) < 1) {
926 /* Every accession must have one and ONLY ONE gi */
927 return 0;
928 }
929 *giptr = (Int4Ptr) MemNew(sizeof(Int4)*GiNum);
930 for(i = 0; i < GiNum; i++) {
931 BSRead(bsp, &gi, sizeof(Int4));
932 (*giptr)[i] = gi;
933 }
934
935 BSFree(bsp);
936 return GiNum;
937 }
938
PrintGi(Int4 gi,Int4 format,FILE * fd,Int4 seq_type,Int4 single_entry,BGenBankPtr bgbp)939 static Boolean PrintGi(Int4 gi, Int4 format, FILE *fd,
940 Int4 seq_type, Int4 single_entry,
941 BGenBankPtr bgbp)
942 {
943 SeqEntryPtr sep, sep_all;
944 AsnIoPtr aip;
945 Boolean retvalue = TRUE;
946 Boolean is_na = (Boolean)(seq_type == BSEQ_NA);
947 SeqIdPtr sip = NULL;
948 BioseqPtr bsp;
949
950 if(format == F_GILIST) {
951 fprintf(fd, "%d\r\n", (int)gi);
952 return TRUE;
953 }
954
955 if((sep_all = EntrezSeqEntryGet(gi, single_entry)) == NULL) {
956 printf("**** WARNING: NULL Seq Entry pointer - "
957 "request failed for gi %d\r\n", (int)gi);
958 return FALSE;
959 }
960
961 if(single_entry) {
962 ObjMgrRegister(OBJ_SEQENTRY, sep_all);
963 sip = ValNodeNew(NULL);
964 sip->choice = SEQID_GI;
965 sip->data.intvalue = gi;
966
967 if((bsp = BioseqFind(sip)) == NULL) {
968 printf("**** WARNING: NULL Bioseq pointer - "
969 "request failed for gi %d\r\n", (int)gi);
970 return FALSE;
971 }
972 sep = SeqEntryNew();
973 sep->choice = 1; /* Bioseq */
974 sep->data.ptrvalue = bsp;
975 } else {
976 sep = sep_all;
977 }
978
979 switch(format) {
980 case F_GEN:
981 if(!SeqEntryToFlatEx(sep_all, fd,
982 (Uint1) (is_na ? GENBANK_FMT : GENPEPT_FMT),
983 RELEASE_MODE, sip, FF_REGULAR)) {
984
985 if(!SeqEntryToFlatEx(sep_all, fd,
986 (Uint1)(is_na ? GENPEPT_FMT : GENBANK_FMT),
987 RELEASE_MODE, sip, FF_REGULAR)) {
988 retvalue = FALSE;
989 }
990 }
991 break;
992 case F_FASTA:
993 if(!SeqEntryToFasta(sep, fd, is_na)) {
994 if(!SeqEntryToFasta(sep, fd, (Uint1)!is_na)) {
995 printf("**** WARNING: Printing of FASTA format "
996 "(gi=%d) failed\r\n", (int)gi);
997 retvalue = FALSE;
998 }
999 }
1000 break;
1001 case F_DLIST:
1002
1003 if (IS_Bioseq(sep))
1004 retvalue = SeqEntrysToDefline(sep, fd, is_na, 3);
1005 else
1006 retvalue = SeqEntrysToDefline(sep, fd, is_na, 0);
1007
1008 if(retvalue == FALSE) {
1009 if (IS_Bioseq(sep))
1010 retvalue = SeqEntrysToDefline(sep, fd, (Uint1) !is_na, 3);
1011 else
1012 retvalue = SeqEntrysToDefline(sep, fd, (Uint1) !is_na, 0);
1013 }
1014
1015 break;
1016
1017 case F_ASN1:
1018 aip = AsnIoNew(ASNIO_TEXT_OUT, fd, NULL, NULL, NULL);
1019 retvalue = SeqEntryAsnWrite(sep, aip, NULL);
1020 AsnIoClose(aip);
1021 break;
1022 case F_ASN1_GENB:
1023 retvalue = SeqEntryAsnWrite(sep, bgbp->aip, bgbp->atp);
1024 break;
1025 default:
1026 return FALSE;
1027 }
1028
1029
1030 if(single_entry) {
1031 SeqIdFree(sip);
1032 MemFree(sep);
1033 }
1034
1035 SeqEntryFree(sep_all);
1036
1037 return retvalue;
1038 }
1039
1040
GetGisFromOrg(CharPtr org,Int4Ptr GiNum,Int4 seqtype)1041 static ByteStorePtr GetGisFromOrg(CharPtr org, Int4Ptr GiNum, Int4 seqtype)
1042 {
1043 ByteStorePtr bsp;
1044 CharPtr str;
1045
1046 str = (CharPtr) MemNew(StringLen(org)+30);
1047 sprintf(str, "\"%s\"[ORGN]", org);
1048
1049 if((bsp = EntrezTLEvalXString(str, (Int2) seqtype, -1, NULL, NULL)) == NULL) {
1050 ErrLogPrintf("Failure to parse input string");
1051 return NULL;
1052 }
1053 BSSeek(bsp, 0L, 0);
1054 *GiNum = BSLen(bsp)/sizeof(DocUid);
1055 return bsp;
1056 }
1057
GetAccList(BatchParamPtr batchP,Int4Ptr TotalItems)1058 static BatchAccListPtr GetAccList(BatchParamPtr batchP,
1059 Int4Ptr TotalItems)
1060 {
1061 Char TmpBuff[16];
1062 Int4 i, j, k;
1063 Int4 FileLen = 0;
1064 BatchAccListPtr AccList = NULL;
1065 BatchAccListPtr AccListTmp, AccListLast;
1066 Int4 NumNotValid = 0;
1067 Int4 NumGis;
1068 Int4Ptr giptr;
1069 CharPtr file;
1070 Int2 seqtype;
1071
1072 if (!batchP || (file = batchP->file) == NULL || !file[0]) {
1073 *TotalItems = 0;
1074 return NULL;
1075 }
1076 seqtype = batchP->sequence;
1077
1078 FileLen = StringLen(file);
1079
1080 for(i = 0; i < FileLen; i++) {
1081
1082 if(isspace(file[i]) || file[i] == ',') /* Rolling spaces */
1083 continue;
1084
1085 /* This is defence from badly formatted requests */
1086
1087 if(!batchP->CommandLine && NumNotValid > 10) {
1088 printf("**** ERROR: Too many invalid Gis/Accessions, "
1089 "parsing aborted\n");
1090 *TotalItems = 0;
1091 return NULL;
1092 }
1093
1094 /* Rolling spaces */
1095
1096 j= 0;
1097 while (!isspace(file[i]) && j < 10 && i < FileLen) {
1098 TmpBuff[j] = file[i];
1099 j++; i++;
1100 if(file[i] == ',') /* Comma is valid delimiter */
1101 break;
1102 }
1103 TmpBuff[j] = NULLB;
1104
1105
1106 /* Ignore strings like ">Protein" */
1107
1108 if(j > 0 && TmpBuff[0] == '>' && IS_ALPHA(TmpBuff[1]))
1109 continue;
1110
1111 /* Is gi/accession too long ??? */
1112
1113 if(j == 10) {
1114 printf("**** WARNING: Gi/Accession \"%s\" is too long\r\n",
1115 TmpBuff);
1116 NumNotValid++;
1117
1118 while(!isspace(file[i]) ||
1119 file[i] == ',' ||
1120 file[i] == NULLB) /* Rolling until spaces */
1121 i++;
1122 continue; /* Next may be valid ... who knows...?? */
1123 }
1124
1125 /* Now validating accession/gi */
1126
1127 for(k =0; k < j; k++) {
1128 if(!isdigit(TmpBuff[k])) {
1129 break;
1130 }
1131 }
1132 if(k != j) {
1133 if(!IS_ntdb_accession(TmpBuff) && !IS_protdb_accession(TmpBuff)) {
1134 printf("**** WARNING: Gi/Accession \"%s\" is not valid\r\n",
1135 TmpBuff);
1136 NumNotValid++;
1137 continue;
1138 }
1139 }
1140
1141 /* If this is valid Accession check and tranfer it to gi */
1142
1143 giptr = NULL;
1144 if((NumGis = AccessionToGi(TmpBuff, &giptr, seqtype)) == 0) {
1145 printf("**** WARNING: Gi/Accession %s is not found "
1146 "in database----\r\n",
1147 TmpBuff);
1148 NumNotValid++;
1149 continue;
1150 } else {
1151 for (j = 0; j < NumGis; j++) {
1152 /* It we come here - we got valid text ID */
1153
1154 if(AccList == NULL) { /* first element */
1155 AccList = (BatchAccListPtr) MemNew(sizeof(BatchAccList));
1156 AccListTmp = AccList;
1157 AccListTmp->acc = StringSave(TmpBuff);
1158 AccListTmp->gi = giptr[j];
1159 AccListTmp->next = NULL;
1160 AccListLast=AccListTmp;
1161 *TotalItems = *TotalItems +1;
1162 } else {
1163 AccListTmp = (BatchAccListPtr)
1164 MemNew(sizeof(BatchAccList));
1165 AccListLast->next = AccListTmp;
1166 AccListTmp->acc = StringSave(TmpBuff);
1167 AccListTmp->gi = giptr[j];
1168 AccListTmp->next = NULL;
1169 AccListLast = AccListTmp;
1170 *TotalItems = *TotalItems +1;
1171 }
1172 }
1173 MemFree(giptr);
1174 }
1175 }
1176 if(NumNotValid) {
1177 printf("**** %d invalid Gi%s/Accession%s present in Entrez-batch "
1178 "request\r\n",
1179 (int)NumNotValid,
1180 NumNotValid == 1 ? "" : "s",
1181 NumNotValid == 1 ? "" : "s"
1182 );
1183 }
1184 return AccList;
1185 }
/*
 * Header callback handed to head_tail_ff() in Main().
 * Intentionally empty: both arguments are ignored and nothing is written.
 */
void BatchHead(VoidPtr pointer, FILE *fd)
{
    (void) pointer;
    (void) fd;
}
/*
 * Trailer callback handed to head_tail_ff() in Main().
 * Intentionally empty: both arguments are ignored and nothing is written.
 */
void BatchTail(VoidPtr pointer, FILE *fd)
{
    (void) pointer;
    (void) fd;
}
1194
1195