1 /* $Id: batch.c,v 6.19 2000/08/30 16:44:20 vakatov Exp $
2 * ===========================================================================
3 *
4 *                            PUBLIC DOMAIN NOTICE
5 *               National Center for Biotechnology Information
6 *
7 *  This software/database is a "United States Government Work" under the
8 *  terms of the United States Copyright Act.  It was written as part of
9 *  the author's official duties as a United States Government employee and
10 *  thus cannot be copyrighted.  This software/database is freely available
11 *  to the public for use. The National Library of Medicine and the U.S.
12 *  Government have not placed any restriction on its use or reproduction.
13 *
14 *  Although all reasonable efforts have been taken to ensure the accuracy
15 *  and reliability of the software and data, the NLM and the U.S.
16 *  Government do not and cannot warrant the performance or results that
17 *  may be obtained by using this software or data. The NLM and the U.S.
18 *  Government disclaim all warranties, express or implied, including
19 *  warranties of performance, merchantability or fitness for any particular
20 *  purpose.
21 *
22 *  Please cite the author in any work or product based on this material.
23 *
24 * ===========================================================================
25 *
26 * File Name:  $RCSfile: batch.c,v $
27 *
28 * Author:  Sergei Shavirin
29 *
30 * Version Creation Date: 12/16/1996
31 *
32 * $Revision: 6.19 $
33 *
34 * File Description:
35 *         Main file for WWW and Command Line BatchEntrez programs
36 *
37 * $Log: batch.c,v $
38 * Revision 6.19  2000/08/30 16:44:20  vakatov
39 * Fixed printf() format mismatch
40 *
41 * Revision 6.18  2000/05/09 13:37:01  shavirin
42 * Use functions IS_ntdb_accession() and IS_protdb_accession() to
43 * verify accession number.
44 *
45 * Revision 6.17  2000/02/03 21:00:38  beloslyu
46 * fix the NCBI_Date initialization
47 *
48 * Revision 6.16  1999/10/21 21:10:04  shavirin
* Added possibility to retrieve RefSeq accessions.
50 *
51 * Revision 6.15  1999/02/24 16:49:23  kans
52 * use accutils copy of IS_ntdb_accession and IS_protdb_accession
53 *
54 * Revision 6.14  1998/12/15 17:56:05  vakatov
* Fixed a tiny C++ compilation bug
56 *
57 * Revision 6.13  1998/07/07 13:43:41  shavirin
58 * Fixed warning of tough compiler setting.
59 *
60 * Revision 6.12  1998/05/19 21:54:05  shavirin
61 * Fixed function, that prints Batch Entrez WWW page
62 *
63 * Revision 6.11  1998/05/08 15:51:30  vakatov
64 * fixed UMR and a tiny typo;  cleaned up some code
65 *
66 * Revision 6.10  1998/05/01 17:57:47  shavirin
67 * New revision
68 *
69 * Revision 6.9  1998/04/17 20:53:50  shavirin
70 * Check for accession format was made more "relaxed".
71 *
72 * Revision 6.8  1998/03/26 21:08:42  shavirin
73 * Changed exit(1) -> return 1 in Main() function.
74 *
75 * Revision 6.7  1997/12/10 18:00:24  shavirin
76 * Removed limits on number of gis to retrieve from command line mode
77 *
78 * Revision 6.5  1997/12/09 16:13:44  shavirin
79 * Removed message in ASN1_GENB case
80 *
81 * Revision 6.4  1997/12/01 20:09:56  shavirin
82 * Removed message in front of ASN1 outputs
83 *
84 * Revision 6.3  1997/11/26 21:57:13  shavirin
85 * Added format 5 - Single GenBank Bioseq-set
86 *
87 * Revision 6.2  1997/11/03 20:48:42  shavirin
88 * Added workaround for API bug with single gi retrieval
89 *
90 * Revision 6.1  1997/09/10 14:05:34  shavirin
* Added AE- type of accessions handling
92 *
93 * Revision 6.0  1997/08/25 18:19:05  madden
94 * Revision changed to 6.0
95 *
96 * Revision 1.17  1997/07/23 19:24:32  shavirin
97 * Changed default background to white
98 *
99 * Revision 1.16  1997/07/22 18:57:30  shavirin
* Removed any limits for number of retrieved entries
101 * if program used from command line
102 *
103 * Revision 1.15  1997/07/21 15:03:18  shavirin
104 * Now strings like ">Protein" will be ignored
105 *
106 * Revision 1.14  1997/07/03 16:23:17  shavirin
107 * Added ability to retrieve few gis from single accession
108 *
109 * Revision 1.13  1997/06/27 18:32:59  shavirin
110 * Added AF- style nucleotide accessions to be accepted
111 *
112 * Revision 1.12  1997/05/14 19:14:31  shavirin
113 * Added #define LF 10
114 *
115  * Revision 1.11  1997/04/25  04:25:21  shavirin
116  * Few fixes due to usage of the program through proxy and small
117  * bug with reading from file
118  *
119  * Revision 1.10  1997/04/09  19:29:24  shavirin
120  * Included ability to retrieve Protein accessions
121  *
122  * Revision 1.9  1997/03/28  18:23:13  shavirin
123  * Use PubMed accession index instead of SeqId index. Removed "www,www3"
124  * references for better proxying.
125  *
126  * Revision 1.8  1997/03/14  15:38:38  shavirin
127  * Removed difference between capital and small characters for
128  * accesssion number checkup.
129  *
130  * Revision 1.7  1997/03/13  16:15:52  shavirin
131  * Added new option for WWW Batch Entrez to retrieve single entry
132  * or complete set.
133  *
134  * Revision 1.6  1997/03/12  22:47:41  shavirin
135  * Added option to return only one entry from one entry
136  *
137  * Revision 1.5  1997/03/04  17:19:22  shavirin
138  * Fixed parser for long invalid accessions and added comma as
139  * valid delimiter of accesssions/gis
140  *
141  * Revision 1.4  1997/01/23  19:02:37  shavirin
142  * Removed creation of spurious logfiles in command-line mode
143  *
144  * Revision 1.3  1996/12/17  17:27:18  shavirin
145  * Function WWWSendBatchPage() changed to static
146  *
147  * Revision 1.2  1996/12/16  19:55:35  shavirin
148  * Changed file description.
149  *
150  * Revision 1.1  1996/12/16  19:51:37  shavirin
151  * Initial revision
152  *
153 *
154 * ==========================================================================
155 */
156 
/* Log file used when the program is NOT run from the command line:
   Main() passes it to ErrSetLogfile() and also appends one record per
   request to it. */
#define LogFile "wwwbatch.log"
158 
159 #include <ncbi.h>
160 #include <ffprint.h>
161 #include <accentr.h>
162 #include <accutils.h>
163 #include <tofasta.h>
164 #include <asn2ff.h>
165 #include <ncbiwww.h>
166 
/* Character code 10 (ASCII line feed); printed with "%c" after the HTML
   fragments emitted by this file. */
#define LF 10
168 
/* Singly linked list of requested sequences, as returned by GetAccList()
   and walked by Main(). */
typedef struct BatchAccList {
  CharPtr acc;               /* accession text; Main() prints it in warnings */
  Int4 gi;                   /* gi handed to PrintGi() for this entry */
  struct BatchAccList *next; /* next list element, NULL terminates the list */
} BatchAccList, PNTR BatchAccListPtr;
174 
/* Parameters of one batch request; filled either from the WWW form
   (MakeBatchParameters) or from the command line
   (MakeCommandLineParameters). */
typedef struct BatchParam {
    Int4     format;       /* output format, one of the F_* constants */
    Boolean  dump;         /* TRUE: WWW reply is sent as application/octet-stream */
    Int4     single_entry; /* passed through to PrintGi(); cleared by RETURN_ALL_SET */
    Int4     request;      /* REQ_ORG or REQ_LIST */
    Int4     sequence;     /* BSEQ_NA or BSEQ_AA */
    CharPtr  organism;     /* organism name for REQ_ORG requests */
    CharPtr  file;         /* text holding the gis/accessions (or a single ID) */
    Boolean  html;         /* TRUE: HTML decorated output */
    Boolean  id_lookup;    /* set when a single ID was given with -u */
    Boolean  CommandLine;  /* set by Main() when running in command-line mode */
} BatchParam, PNTR BatchParamPtr;
187 
/* State of the single Bioseq-set ASN.1 output (format F_ASN1_GENB);
   created by BGenBankInit() and released by BGenBankClose(). */
typedef struct BGenBank {
    AsnIoPtr   aip;                    /* text ASN.1 output stream on stdout */
    AsnTypePtr atp;                    /* type "Bioseq-set.seq-set.E" */
    AsnTypePtr atp_bioseq_set_seq_set; /* type "Bioseq-set.seq-set", closed in BGenBankClose() */
    AsnTypePtr atp_bioseq_set;         /* type "Bioseq-set", closed in BGenBankClose() */
} BGenBank, PNTR BGenBankPtr;
194 
/* Values of BatchParam.request */
#define REQ_ORG   0   /* retrieve all sequences of an organism */
#define REQ_LIST  1   /* retrieve sequences from a list of gis/accessions */

/* Values of BatchParam.format; the numbers match the -f command-line
   argument described in dump_args[] */
#define F_GEN       0   /* GenBank/GenPept */
#define F_FASTA     1   /* FASTA */
#define F_ASN1      2   /* ASN.1 */
#define F_GILIST    3   /* List of GIs */
#define F_DLIST     4   /* List of Deflines */
#define F_ASN1_GENB 5   /* Single GenBank Bioseq-set ASN.1 */

/* Values of BatchParam.sequence */
#define BSEQ_NA   TYP_NT   /* nucleotide */
#define BSEQ_AA   TYP_AA   /* protein */

/* Maximum number of gis/accessions served to non-command-line requests;
   SEARCH_DLIMIT applies when the format is F_DLIST, SEARCH_LIMIT otherwise */
#define SEARCH_LIMIT  20000
#define SEARCH_DLIMIT 70000
210 
/* Prints the Batch Entrez query form; non-zero "which" selects the
   file-upload variant of the form. */
static void WWWSendBatchPage(Int4 which);

/* NOTE(review): bodies are not fully visible from here - AccessionToGi()
   appears to allocate *giptr and return the number of gis found;
   GetAccList() returns the list of requested entries and their count. */
static Int4 AccessionToGi(CharPtr string,  Int4Ptr PNTR giptr, Int2 seqtype);
static BatchAccListPtr GetAccList(BatchParamPtr batchP, Int4Ptr total);

/* Prints one gi in the requested format; Main() treats a FALSE return
   as a printing failure. */
static Boolean PrintGi(Int4 gi, Int4 format,
                       FILE *fd, Int4 seq_type,
                       Int4 single_entry, BGenBankPtr bgbp);

/* Builds request parameters from the m entries of a posted WWW form. */
static BatchParamPtr MakeBatchParameters(WWWEntryPtr PNTR entries, Int4 m);
/* Returns the gis of an organism as a ByteStore (read by Main() as
   consecutive Int4 values) and stores their number in *GiNum; NULL on
   failure. */
static ByteStorePtr GetGisFromOrg(CharPtr org,
                             Int4Ptr GiNum, Int4 seqtype);

extern Boolean SeqEntrysToDefline(SeqEntryPtr sep,
                                  FILE *fp, Boolean is_na, Uint1 group_segs);
/* Builds request parameters from the command-line arguments. */
static BatchParamPtr MakeCommandLineParameters(void);

/* Head/tail callbacks registered with head_tail_ff() for HTML output. */
void BatchHead(VoidPtr pointer, FILE *fd);
void BatchTail(VoidPtr pointer, FILE *fd);
230 
/* NULL-terminated list of organism names offered in the LIST_ORG
   <select> of the query form.  The first entry, "(None)", is treated by
   MakeBatchParameters() as "nothing selected". */
static CharPtr organism[] = { "(None)",
                              "Arabidopsis thaliana",
                              "Bacillus subtilis",
                              "Bos taurus",
                              "Caenorhabditis elegans",
                              "Dictyostelium discoideum",
                              "Drosophila melanogaster",
                              "Escherichia coli",
                              "Gallus gallus",
                              "Homo sapiens",
                              "Human immunodeficiency virus type 1",
                              "Mus musculus",
                              "Oryctolagus cuniculus",
                              "Oryza sativa",
                              "Ovis aries",
                              "Rattus norvegicus",
                              "Saccharomyces cerevisiae",
                              "Schizosaccharomyces pombe",
                              "Simian immunodeficiency virus",
                              "Xenopus laevis",
                              "Zea mays",
                              NULL
};
254 
255 
/* Looks up the ASN.1 type whose name is the stringified second argument
   and stores it in "atp".
   - Requires a variable named "amp" (the module list) in the calling
     scope.
   - On failure posts an error and executes "return NULL" in the CALLING
     function, so it may only be used in functions returning a pointer,
     and anything acquired before the macro is not released.
   - Expands to a bare "if" statement (no do/while(0) wrapper): do not
     use it as the body of an unbraced if/else. */
#define MACRO_atp_find(atp,name)\
        if((atp = AsnTypeFind(amp, #name))==NULL){\
                ErrPostEx(SEV_ERROR,0,0,\
                        "Could not find type <%s>", #name);\
                return NULL; \
        }
262 
263 
/* HTML banner (image map) printed at the top of the query form */
#define BatchTitle "<A HREF=\"/htbin-post/PubMed/imagemap/EntrezBatch/batch.map\"><IMG SRC=\"/EntrezBatch/batch.gif\" BORDER=0 ISMAP HEIGHT=22 WIDTH=500></A>"

/* HTML banner (image map) printed at the top of result and error pages */
#define BatchResultsTitle "<A HREF=\"/htbin-post/PubMed/imagemap/EntrezBatch/batch.map\"><IMG SRC=\"/EntrezBatch/batch_results.gif\" BORDER=0 ISMAP HEIGHT=22 WIDTH=500></A>"
267 /************************************************************************
268  *
269  *  void WWWSendPage(Int4 which) - function to draw entry table for
270  *                                 the WWW Blast program
271  *
272  ***********************************************************************/
273 
WWWSendBatchPage(Int4 which)274 static void WWWSendBatchPage(Int4 which) {
275   register Int4 i;
276 
277   printf("HTTP/1.0 200 OK\r\n");
278   printf("MIME-Version: 1.0\r\n");
279 
280   printf("Content-type: text/html\r\n\r\n");
281   printf("<HTML>\n<HEAD>\n");
282   printf("<TITLE>Batch Entrez</TITLE>\n</HEAD>\n");
283 
284   printf("<BODY bgcolor=\"#FFFFFF\" text=\"#000000\" "
285          "link=\"#0000f0\" vlink=\"#6000b0\" alink=\"#f00000\">\n");
286 
287   printf("%s%c", BatchTitle, LF);
288 
289   printf("<FORM ACTION=\"http://%s:%s%s/result\" METHOD=POST "
290          "NAME=\"BATCH\" %s >%c",
291          getenv("SERVER_NAME") != NULL ? getenv("SERVER_NAME") : "NOT_SET",
292          getenv("SERVER_PORT") != NULL ? getenv("SERVER_PORT") : "NOT_SET",
293          getenv("SCRIPT_NAME") != NULL ? getenv("SCRIPT_NAME") : "NOT_SET",
294          which ? "ENCTYPE=\"multipart/form-data\" " : "", LF);
295 
296   printf("<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp; This page is designed "
297          "to allow you to download (receive) a large "
298          "number of sequences from Entrez, in a batch mode. "
299          "The results of the search will be saved to a local "
300          "disk file on your machine.  Upon submitting your query, "
301          "you will be prompted to provide the filename "
302          "where the results will be stored. Please make sure that "
303          "you have enough disk space on your computer before "
304          "submitting this request.\n%c", LF);
305 
306   printf("<BR><BR><B>Choose type of sequences "
307          "to search and format of output: </B><BR><BR>");
308 
309   printf("Sequence type: %c",LF);
310 
311   printf("<select name = SEQ_TYPE>");
312   printf("<option> Nucleotide ");
313   printf("<option> Protein ");
314   printf("</select>");
315 
316   printf("&nbsp;&nbsp;"
317          "<INPUT TYPE=\"checkbox\" NAME=\"RETURN_ALL_SET\" "
318          "VALUE=YES>&nbsp;&nbsp; "
319          "Include all records within a segmented set");
320 
321   printf("<BR>");
322 #ifdef NOT_SAVE
323   printf("<INPUT TYPE=\"checkbox\" NAME=\"DUMP_TYPE\" "
324          "VALUE=FILE></B>&nbsp;&nbsp; Save results to file");
325 #else
326   printf("<INPUT TYPE=\"hidden\" NAME=\"DUMP_TYPE\" "
327          "VALUE=FILE >");
328 #endif
329 
330   printf("</B>Format: &nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"
331          "&nbsp;&nbsp;&nbsp;&nbsp;%c",LF);
332 
333   printf("<select name = FORMAT>");
334   printf("<option> GenBank/GenPept");
335   printf("<option> FASTA");
336   printf("<option> ASN.1");
337   printf("<option> List of GIs");
338   printf("<option> List of Deflines");
339   printf("</select>");
340 
341   printf("<INPUT TYPE=\"checkbox\" NAME=\"HTML\" "
342          "VALUE=HTML CHECKED>&nbsp;&nbsp; HTML");
343 
344   printf("<HR>");
345 
346   printf("<INPUT TYPE=\"radio\" NAME=\"REQUEST_TYPE\" "
347          "VALUE=ORGANISM CHECKED> &nbsp;&nbsp;&nbsp; <B>Retrieve all "
348          "sequences for a specific organism.</B>");
349 
350   printf("<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"
351          "Enter organism name here"
352          /*  "(use full scientific name or common name)" */
353          "</B>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"
354          "<INPUT TYPE=\"text\" NAME=\"ORGNAME\" "
355          "VALUE=\"\" MAXLENGTH=\"50\">");
356 
357   printf("<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;"
358          "Or choose it from list: <select name=LIST_ORG>");
359   for(i=0; organism[i] != NULL; i++)
360     printf("<option> %s ", organism[i]);
361   printf("</select>");
362   printf("<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;<A HREF="
363          "\"/Taxonomy/tax.html\">"
364          "Explore the taxonomy database at NCBI</A>");
365 
366   printf("<HR>");
367   printf("<INPUT TYPE=\"radio\" NAME=\"REQUEST_TYPE\" "
368          "VALUE=FILESUBMIT > &nbsp;&nbsp;&nbsp; <B>Retrieve all sequences from a "
369          "%s of Gis/Accessions</b>",
370          which? "file" : "list");
371 
372   if(!which) {
373 
374     printf("<BR>Enter gis/accessions here "
375            "(delimited by spaces or newlines)<BR> ");
376     printf("<textarea name=\"USERFILE\" rows=6 cols=60>"
377            "</textarea>%c", LF);
378   } else {
379     printf("<BR>&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;&nbsp;");
380     printf("Enter filename here &nbsp; "
381            "<INPUT TYPE=\"file\" NAME=\"USERFILE\" "
382            "onFocus=\"window.status='Press radio button to "
383            "activate this search type'; return true;\" ");
384   }
385 
386   printf("<HR>");
387   printf("<BR><INPUT TYPE=\"submit\">%c", LF);
388   printf("<INPUT TYPE=\"reset\" VALUE=\"Clear input\">%c", LF);
389   printf("</FORM>%c", LF);
390   printf("<HR>%c", LF);
391 
392   printf("<ADDRESS>");
393   printf("Comments and suggestions to:"
394          "&lt; <a href=\"mailto:info@ncbi.nlm.nih.gov\">"
395          "info@ncbi.nlm.nih.gov"
396          "</a> &gt; <BR> Credits to: "
397          "<a href=\"mailto:shavirin@ncbi.nlm.nih.gov\">"
398          "Sergei B. Shavirin</a>\n"
399          "<!-- <a href=\"http://www.ncbi.nlm.nih.gov/STS/shavirin.html\">"
400          "Sergei B. Shavirin</a> -->"
401          "<BR>Acknowledgements to: "
402          "<a href=\"mailto:epstein@ncbi.nlm.nih.gov\">"
403          "Jonathan Epstein</a>");
404   printf("</ADDRESS>%c", LF);
405 
406 } /* WWWSendBatchPage() */
407 
408 /************************************************************************
409  *
410  *  Int2 Main() - main function for the WWW BatchEntrez search program
411  *
412  ***********************************************************************/
#define NUMARGS 8

/* Command-line arguments of the program.  MakeCommandLineParameters()
 * reads them by position, so the order of the entries must not change:
 *   [0] -s  sequence type (non-zero selects protein)
 *   [1] -f  output format (stored in BatchParam.format)
 *   [2] -n  stored in BatchParam.single_entry
 *   [3] -h  text or HTML output
 *   [4] -i  file with the list of gis/accessions
 *   [5] -o  organism name
 *   [6] -u  single ID (accession or gi)
 *   [7] -l  log file name
 */
Args dump_args[NUMARGS] = {
  {"Sequence type\n"
   "         0 - Nucleotide  \n"
   "         1 - Protein",
   NULL, NULL,NULL,FALSE,'s',ARG_INT,0.0,0,NULL},
  {"Format of output\n"
   "         0 - GenBank/GenPept   \n"
   "         1 - FASTA \n"
   "         2 - ASN.1 \n"
   "         3 - List of GIs\n"
   "         4 - List of Deflines\n"
   "         5 - Single GenBank Bioseq-set ASN.1",
   "0", NULL,NULL,FALSE,'f',ARG_INT, 0.0,0,NULL},
  {"Number of returned entries from single gi/accession\n"
   "         0 - Return all entries in SeqEntry, that available  \n"
   "         1 - Return single entry specified by gi/accession",
   "0", NULL,NULL,FALSE,'n',ARG_INT,0.0,0,NULL},
  {"Text or HTML? (For GenBank/GenPept)\n"
   "         0 - Text output   \n"
   "         1 - HTML output",
   "0", NULL,NULL,FALSE,'h',ARG_INT, 0.0,0,NULL},
  { "File with list of GIS/Accessions",
    "stdin", NULL, NULL, TRUE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
  {"Organism name (for retrieve by organism)",
   NULL, NULL,NULL,TRUE,'o',ARG_STRING, 0.0,0,NULL},
  {"ID (accession or gi)",
   NULL, NULL,NULL,TRUE,'u',ARG_STRING, 0.0,0,NULL},
  {"Logfile name:",
   "nbatch.log", NULL,NULL,TRUE,'l',ARG_FILE_OUT, 0.0,0,NULL}
};
445 
MakeCommandLineParameters(void)446 static BatchParamPtr MakeCommandLineParameters(void)
447 {
448   BatchParamPtr batchP;
449   FILE *fd;
450 
451   if ( !GetArgs ("Nbatch",NUMARGS,dump_args) ) {
452     return NULL;
453   }
454 
455   if( !ErrSetLogfile (dump_args[7].strvalue, ELOG_APPEND) ) {
456     exit(1);
457   }
458   ErrSetLogLevel(SEV_MAX);
459 
460   batchP = (BatchParamPtr) MemNew(sizeof(BatchParam));
461 
462   batchP->dump     = TRUE;
463 
464   if(dump_args[0].intvalue)
465     batchP->sequence = BSEQ_AA;
466   else
467     batchP->sequence = BSEQ_NA;
468 
469   batchP->format  =        dump_args[1].intvalue;
470   batchP->single_entry =        dump_args[2].intvalue;
471   batchP->html    = (Uchar)dump_args[3].intvalue;
472   batchP->request = REQ_LIST;
473 
474   if(dump_args[6].strvalue != NULL) {
475 
476     batchP->file = StringSave(dump_args[6].strvalue);
477     batchP->id_lookup = TRUE;
478 
479   } else if((batchP->organism =
480              StringSaveNoNull(dump_args[5].strvalue)) != NULL) {
481     batchP->file = StringSave("");
482     batchP->request = REQ_ORG;
483   } else if(dump_args[4].strvalue != NULL) {
484     if((fd = FileOpen(dump_args[4].strvalue, "r")) == NULL) {
485       ErrLogPrintf("Input file do not exists or empty\n");
486       return NULL;
487     }
488     if((batchP->file = WWWReadFileInMemory(fd, 0, TRUE)) == NULL)
489       return NULL;
490     FileClose(fd);
491 
492     batchP->organism = StringSave("");
493 
494   } else { /* Error no valid input found This MUST not happen */
495     return NULL;
496   }
497 
498   return batchP;
499 }
BGenBankInit(void)500 BGenBankPtr BGenBankInit(void)
501 {
502     BGenBankPtr bgbp;
503     AsnModulePtr amp;
504 
505     AsnTypePtr       atp_bioseq_set;
506     AsnTypePtr       atp_bioseq_set_level;
507     AsnTypePtr       atp_bioseq_set_class;
508     AsnTypePtr       atp_bioseq_set_release;
509     AsnTypePtr       atp_bioseq_set_date;
510     AsnTypePtr       atp_bioseq_set_seq_set;
511     AsnTypePtr       atp_bioseq_set_seq_set_E;
512 
513     Char  release[] = "Nbatch Dump";
514     Char  date_time[128];
515     NCBI_Date       date={{0,0,0,0,255,255,255},NULL};
516     DataVal dv;
517 
518     bgbp = (BGenBank*)MemNew(sizeof(BGenBank));
519     bgbp->aip = AsnIoNew(ASNIO_TEXT_OUT, stdout, NULL, NULL, NULL);
520 
521     amp = AsnAllModPtr();
522 
523     MACRO_atp_find(atp_bioseq_set,Bioseq-set);
524     MACRO_atp_find(atp_bioseq_set_level,Bioseq-set.level);
525     MACRO_atp_find(atp_bioseq_set_class,Bioseq-set.class);
526     MACRO_atp_find(atp_bioseq_set_release,Bioseq-set.release);
527     MACRO_atp_find(atp_bioseq_set_date,Bioseq-set.date);
528     MACRO_atp_find(atp_bioseq_set_seq_set,Bioseq-set.seq-set);
529     MACRO_atp_find(atp_bioseq_set_seq_set_E,Bioseq-set.seq-set.E);
530 
531     if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set,NULL))
532         return NULL;
533     dv.intvalue = 0;
534 
535     if(!AsnWrite(bgbp->aip,atp_bioseq_set_level,&dv))
536         return NULL;
537     dv.intvalue = 7;
538 
539     if(!AsnWrite(bgbp->aip,atp_bioseq_set_class,&dv))
540         return NULL;
541     dv.ptrvalue = release;
542 
543     if(!AsnWrite(bgbp->aip,atp_bioseq_set_release,&dv))
544         return NULL;
545 
546     Nlm_DayTimeStr(date_time,TRUE,TRUE);
547     date.str=date_time;
548 
549     if(!DateAsnWrite(&date,bgbp->aip,atp_bioseq_set_date))
550         return NULL;
551 
552     if(!AsnOpenStruct(bgbp->aip,atp_bioseq_set_seq_set,NULL))
553         return NULL;
554     AsnIoFlush(bgbp->aip);
555 
556     bgbp->atp =atp_bioseq_set_seq_set_E;
557     bgbp->atp_bioseq_set_seq_set = atp_bioseq_set_seq_set;
558     bgbp->atp_bioseq_set = atp_bioseq_set;
559 
560     return bgbp;
561 }
/************************************************************************
 *
 *  BGenBankClose() - finishes the output started by BGenBankInit():
 *  closes the seq-set and Bioseq-set structures, closes the ASN.1
 *  stream and frees the state.  A NULL argument (e.g. the result of a
 *  failed BGenBankInit()) is ignored.
 *
 ***********************************************************************/
void BGenBankClose(BGenBankPtr bgbp)
{
    if(bgbp == NULL)
        return;

    /* Close in the reverse order of opening. */
    AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set_seq_set,NULL);
    AsnCloseStruct(bgbp->aip, bgbp->atp_bioseq_set,NULL);

    AsnIoClose(bgbp->aip);
    MemFree(bgbp);
    return;
}
571 
Main()572 Int2 Main ()
573 {
574   Int4 i, gi;
575   ByteStorePtr gis_bsp;
576   Int4 NumGis;
577   BatchParamPtr batchP = NULL;
578   BatchAccListPtr AccList, AccTmp;
579   Int4 TotalNumAcc =0;
580   time_t time_now;
581   CharPtr TimeNowStr;
582   WWWInfoPtr info;
583   WWWErrorCode error;
584   FILE *log_file = NULL;
585   BGenBankPtr  bgbp = NULL;
586 
587   if((error = WWWReadPosting(&info)) != WWWErrOk) {
588     ErrLogPrintf("Error in processing WWW request\n");
589     return 1;
590   }
591 
592   if(WWWGetMethod(info) != COMMAND_LINE) {
593       if( !ErrSetLogfile (LogFile, ELOG_APPEND) ) {
594           return 1;
595       }
596       ErrSetLogLevel(SEV_MAX);
597       log_file = FileOpen(LogFile, "a");
598   }
599 
600   if(WWWGetMethod(info) == COMMAND_LINE) {
601       if((batchP = MakeCommandLineParameters()) == NULL) {
602           return 1;
603       }
604       batchP->CommandLine = TRUE;
605   } else if (WWWGetMethod(info) == WWW_GET) {
606       time_now = GetSecs();
607       TimeNowStr = ctime(&time_now);
608       TimeNowStr[24] = '\0';
609       fprintf(log_file, "\n%s|%s|%s|%s|%d",
610               TimeNowStr, WWWGetAddress(info),
611               WWWGetHost(info), WWWGetAgent(info), 0);
612 
613       if(WWWGetBrowser(info) != NETSCAPE)
614           WWWSendBatchPage(0);
615       else
616           WWWSendBatchPage(1);
617 
618       FileClose(log_file);
619       return 1;
620   } else { /* method == POST */
621 
622       if((batchP = MakeBatchParameters(WWWGetWWWEntries(info),
623                                        WWWGetNumEntries(info))) == NULL) {
624           printf("Content-type: text/html\r\n\r\n");
625           printf("Error in creating BATCH parameters");
626           FileClose(log_file);
627           return 1;
628       }
629   }
630 
631   time_now = GetSecs();
632   TimeNowStr = ctime(&time_now);
633   TimeNowStr[24] = '\0';
634   if(!batchP->CommandLine) {
635       fprintf(log_file, "\n%s|%s|%s|%s|%d|%d|%d|%d|%d|%d|%s|%d|",
636               TimeNowStr, WWWGetAddress(info),
637               WWWGetHost(info), WWWGetAgent(info), (int)1,
638               (int)batchP->format, (int)batchP->html, (int)batchP->dump,
639               (int)batchP->request, (int)batchP->sequence,
640               (batchP->organism[0] == NULLB) ? "(null)" : batchP->organism,
641               (int)(batchP->file[0] == NULLB ? 0 : 1));
642   }
643   FileClose(log_file);
644 
645   if((batchP->request == REQ_ORG) && (StringLen(batchP->organism) < 3)) {
646       if(!batchP->CommandLine) {
647           printf("<TITLE>Batch Entrez Results</TITLE>");
648           printf("%s%c<BR><BR><BR><b>", BatchResultsTitle, LF);
649       }
650       printf("ERROR: Length of organism name must be more "
651              "than 2 characters\n");
652       if(!batchP->CommandLine)
653           printf("</b>");
654       return 1;
655   }
656   if((batchP->file[0] == NULLB) && (batchP->request != REQ_ORG)) {
657       if(!batchP->CommandLine) {
658           printf("Content-type: text/html\r\n\r\n");
659           printf("<TITLE>Batch Entrez Results</TITLE>");
660           printf("%s%c<BR><BR><BR><b>", BatchResultsTitle, LF);
661       }
662       printf("ERROR: You did not entered filename "
663              "with gis/accessions or file may be empty.\n");
664       if(!batchP->CommandLine)
665           printf("</b>");
666       return 1;
667   }
668 
669   if(!batchP->CommandLine) {
670 
671       printf("HTTP/1.0 200 OK\r\n");
672       printf("MIME-Version: 1.0\r\n");
673 
674       if(batchP->dump) {
675           printf("Content-type: application/octet-stream\r\n\r\n");
676       } else {
677           printf("Content-type: text/html\r\n\r\n");
678           printf("<TITLE>Batch Entrez Results</TITLE>");
679           printf("%s%c", BatchResultsTitle, LF);
680       }
681   }
682 
683 #ifdef DEBUG_MODE
684   printf("Content-type: text/html\r\n\r\n");
685   for(i=0; i < m; i++) {
686       printf("%s : %s\n <BR>%c", entries[i]->name, entries[i]->val, LF);
687   }
688   exit(1);
689 #endif
690 
691   putenv("USER=BatchEntrez");
692 
693   if(batchP->html) {
694       init_www(); /* initializing WWW mode */
695       head_tail_ff(NULL, BatchHead, BatchTail);
696   }
697 
698   if(!batchP->dump || batchP->html)
699       printf("<PRE>"); /* Entering text mode */
700 
701   /* Starting up connection to Entrez */
702 
703   if (! EntrezInit("BatchEntrez", FALSE, NULL)) {
704       printf("Cannot initialize Entrez\r\n");
705       return 1;
706   }
707 
708   EntrezBioseqFetchEnable("BatchEntrez", TRUE);
709 
710   /* Finding all gi's of given organism */
711 
712   if(batchP->request == REQ_ORG) {
713       if((gis_bsp = GetGisFromOrg(batchP->organism,
714                                   &NumGis, batchP->sequence)) == NULL) {
715           printf("**** ERROR: Lookup for GI's of organism \"%s\" failed\r\n",
716                  batchP->organism);
717           return 1;
718       } else {
719           if(!batchP->dump || (batchP->format != F_FASTA))
720               printf("**** %d gi's found for organism \"%s\"\r\n\r\n",
721                      (int)NumGis, batchP->organism);
722           /* setting limit of printing */
723           if(WWWGetMethod(info) != COMMAND_LINE &&
724              NumGis > (batchP->format == F_DLIST ? SEARCH_DLIMIT : SEARCH_LIMIT)) {
725               printf("**** The number of retrieved GIs (%d) "
726                      "exceeds the current limit of %d.\r\n"
727                      "**** Only the list of GIs will be downloaded "
728                      "to your computer.  Please divide\r\n"
729                      "**** this list into smaller parts and submit "
730                      "them separately.\r\n\r\n",
731                      (int)NumGis, (int)(batchP->format == F_DLIST ?
732                               SEARCH_DLIMIT : SEARCH_LIMIT));
733               batchP->format = F_GILIST;
734           }
735       }
736       if(batchP->format != F_GILIST && batchP->html)
737           printf("<HR>");
738 
739       if(batchP->format == F_ASN1_GENB)
740           bgbp = BGenBankInit();
741 
742       for(i= 0; i < NumGis; i++) {
743           BSRead(gis_bsp, &gi, sizeof(Int4));
744           if(!PrintGi(gi, batchP->format, stdout,
745                       batchP->sequence, batchP->single_entry, bgbp)) {
746               printf("Error in printing gi %d\r\n", (int) gi);
747           }
748           if(batchP->html)
749               printf("<HR>");
750       }
751 
752       if(batchP->format == F_ASN1_GENB)
753           BGenBankClose(bgbp);
754 
755   } else {
756 
757       /* Printing all sequences from list of accessions in user's file */
758 
759       if((AccList = GetAccList(batchP, &TotalNumAcc)) == NULL) {
760           printf("**** ERROR: No valid Gis/Accessions found\r\n");
761           return 1;
762       }
763 
764       if(batchP->format != F_ASN1_GENB &&
765          batchP->format != F_FASTA &&
766          batchP->format != F_ASN1) {
767           if(!batchP->dump && !batchP->id_lookup)
768               printf("**** %d validly formatted Gis/Accessions present in "
769                      "Entrez-batch request\r\n\r\n", (int)TotalNumAcc);
770       }
771 
772       if(WWWGetMethod(info) != COMMAND_LINE) {
773           if(TotalNumAcc > (batchP->format == F_DLIST ?
774                             SEARCH_DLIMIT : SEARCH_LIMIT)) {
775               printf("**** The number of retrieved Gis/Accessions (%d) "
776                      "exceeds the current limit of %d.\r\n"
777                      "**** Please divide this list into smaller parts and submit "
778                      "them separately.\r\n\r\n",
779                      (int)TotalNumAcc, (int)(batchP->format == F_DLIST ?
780                                    SEARCH_DLIMIT : SEARCH_LIMIT));
781               return 1;
782           }
783       }
784       if(batchP->html)
785           printf("<HR>");
786 
787       if(batchP->format == F_ASN1_GENB)
788           bgbp = BGenBankInit();
789 
790       for (AccTmp = AccList; AccTmp != NULL; AccTmp = AccTmp->next) {
791           if(!PrintGi(AccTmp->gi, batchP->format,
792                       stdout, batchP->sequence, batchP->single_entry, bgbp)) {
793               printf("**** WARNING: Printing Gi %d (%s) failed \r\n\r\n",
794                      (int)AccTmp->gi, AccTmp->acc);
795           }
796 
797           if(batchP->html)
798               printf("<HR>");
799       }
800       if(batchP->format == F_ASN1_GENB)
801           BGenBankClose(bgbp);
802   }
803 
804   /* terminating Entrez connection */
805 
806   if(!batchP->CommandLine)
807       printf("\r\n\r\n**** Transfer completed successfuly -------=-*\r\n\r\n");
808 
809   EntrezFini();
810   EntrezBioseqFetchDisable();
811   fflush(stdout);
812 
813   Nlm_FreeArgs(NUMARGS, dump_args);
814 
815   return 0;
816 }
817 
818 
MakeBatchParameters(WWWEntryPtr PNTR entries,Int4 m)819 static BatchParamPtr MakeBatchParameters(WWWEntryPtr PNTR entries, Int4 m)
820 {
821   Int4 i;
822   BatchParamPtr batchP;
823   Boolean OrganismSet = FALSE;
824 
825   batchP = (BatchParamPtr) MemNew(sizeof(BatchParam));
826 
827   batchP->format   = F_GEN;
828   batchP->dump     = FALSE;
829   batchP->request  = REQ_ORG;
830   batchP->sequence = BSEQ_NA;
831   batchP->organism = NULLB;
832   batchP->file     = NULLB;
833   batchP->html     = FALSE;
834   batchP->single_entry  = TRUE;
835 
836   for(i=0; i <  m; i++) {
837 
838     /* FORMAT */
839 
840     if (!StringICmp(entries[i]->name, "FORMAT")) {
841       if (!StringICmp(entries[i]->val, "GenBank/GenPept"))
842         batchP->format = F_GEN;
843       else if (!StringICmp(entries[i]->val, "FASTA"))
844         batchP->format = F_FASTA;
845       else if (!StringICmp(entries[i]->val, "ASN.1"))
846         batchP->format = F_ASN1;
847       else if (!StringICmp(entries[i]->val, "List of GIs"))
848         batchP->format = F_GILIST;
849       else if (!StringICmp(entries[i]->val, "List of Deflines"))
850         batchP->format = F_DLIST;
851 
852       /* DUMP TYPE */
853 
854     } else if (!StringICmp(entries[i]->name, "DUMP_TYPE")) {
855       batchP->dump = TRUE;
856     } else if (!StringICmp(entries[i]->name, "RETURN_ALL_SET")) {
857       batchP->single_entry = FALSE;
858 
859       /* REQUEST TYPE */
860 
861     } else if (!StringICmp(entries[i]->name, "REQUEST_TYPE")) {
862       if (!StringICmp(entries[i]->val, "ORGANISM"))
863         batchP->request = REQ_ORG;
864       else if (!StringICmp(entries[i]->val, "FILESUBMIT"))
865         batchP->request = REQ_LIST;
866 
867       /* SEQUENCE TYPE */
868 
869     } else if (!StringICmp(entries[i]->name, "SEQ_TYPE")) {
870       if (!StringICmp(entries[i]->val, "nucleotide"))
871         batchP->sequence = BSEQ_NA;
872       else if (!StringICmp(entries[i]->val, "protein"))
873         batchP->sequence = BSEQ_AA;
874 
875       /* ORGANISM NAME */
876 
877     } else if (!StringICmp(entries[i]->name, "ORGNAME")) {
878       batchP->organism = entries[i]->val;
879       if(entries[i]->val[0] != NULLB)
880         OrganismSet = TRUE;
881     } else if (!StringICmp(entries[i]->name, "LIST_ORG") && !OrganismSet) {
882       if (StringICmp(entries[i]->val, "(None)"))
883         batchP->organism = entries[i]->val;
884 
885       /* USER FILE */
886 
887     } else if (!StringICmp(entries[i]->name, "USERFILE")) {
888       batchP->file = entries[i]->val;
889     } else if (!StringICmp(entries[i]->name, "HTML")) {
890       batchP->html = TRUE;
891     }
892   } /* for (i=0... */
893 
894   return batchP;
895 
896 }
897 
AccessionToGi(CharPtr string,Int4Ptr PNTR giptr,Int2 seqtype)898 static Int4 AccessionToGi(CharPtr string,  Int4Ptr PNTR giptr, Int2 seqtype)
899 {
900     Int4 i, gi = 0;
901     CharPtr str;
902     ByteStorePtr bsp;
903     Int4 GiNum;
904 
905     if(string == NULL)
906         return 0;
907 
908     if((gi = atol(string)) > 0) { /* He-he this is gi... */
909         *giptr = (Int4Ptr) MemNew(sizeof(Int4));
910         (*giptr)[0] = gi;
911         return 1;
912     }
913     str = (CharPtr) MemNew(StringLen(string)+30);
914     sprintf(str, "\"%s\"[ACCN]", string);
915 
916     if((bsp = EntrezTLEvalXString(str, seqtype,
917                                   -1, NULL, NULL)) == NULL) {
918         ErrLogPrintf("Failure to parse input string");
919         return 0;
920     }
921     MemFree(str);
922 
923     BSSeek(bsp, 0L, 0);
924 
925     if((GiNum = BSLen(bsp)/sizeof(DocUid)) < 1) {
926         /* Every accession must have one and ONLY ONE gi */
927         return 0;
928     }
929     *giptr = (Int4Ptr) MemNew(sizeof(Int4)*GiNum);
930     for(i = 0; i < GiNum; i++) {
931         BSRead(bsp, &gi, sizeof(Int4));
932         (*giptr)[i] = gi;
933     }
934 
935     BSFree(bsp);
936     return GiNum;
937 }
938 
PrintGi(Int4 gi,Int4 format,FILE * fd,Int4 seq_type,Int4 single_entry,BGenBankPtr bgbp)939 static Boolean PrintGi(Int4 gi, Int4 format, FILE *fd,
940                        Int4 seq_type, Int4 single_entry,
941                        BGenBankPtr bgbp)
942 {
943     SeqEntryPtr sep, sep_all;
944     AsnIoPtr aip;
945     Boolean retvalue = TRUE;
946     Boolean is_na = (Boolean)(seq_type == BSEQ_NA);
947     SeqIdPtr sip = NULL;
948     BioseqPtr bsp;
949 
950     if(format == F_GILIST) {
951         fprintf(fd, "%d\r\n", (int)gi);
952         return TRUE;
953     }
954 
955     if((sep_all = EntrezSeqEntryGet(gi, single_entry)) == NULL) {
956         printf("**** WARNING: NULL Seq Entry pointer - "
957                "request failed for gi %d\r\n", (int)gi);
958         return FALSE;
959     }
960 
961     if(single_entry) {
962         ObjMgrRegister(OBJ_SEQENTRY, sep_all);
963         sip = ValNodeNew(NULL);
964         sip->choice = SEQID_GI;
965         sip->data.intvalue = gi;
966 
967         if((bsp = BioseqFind(sip)) == NULL) {
968             printf("**** WARNING: NULL Bioseq pointer - "
969                    "request failed for gi %d\r\n", (int)gi);
970             return FALSE;
971         }
972         sep = SeqEntryNew();
973         sep->choice = 1; /* Bioseq */
974         sep->data.ptrvalue = bsp;
975     } else {
976         sep = sep_all;
977     }
978 
979     switch(format) {
980     case F_GEN:
981         if(!SeqEntryToFlatEx(sep_all, fd,
982                              (Uint1) (is_na ? GENBANK_FMT : GENPEPT_FMT),
983                              RELEASE_MODE, sip, FF_REGULAR)) {
984 
985             if(!SeqEntryToFlatEx(sep_all, fd,
986                                  (Uint1)(is_na ? GENPEPT_FMT : GENBANK_FMT),
987                                  RELEASE_MODE, sip, FF_REGULAR)) {
988                 retvalue = FALSE;
989             }
990         }
991         break;
992     case F_FASTA:
993         if(!SeqEntryToFasta(sep, fd, is_na)) {
994             if(!SeqEntryToFasta(sep, fd, (Uint1)!is_na)) {
995                 printf("**** WARNING: Printing of FASTA format "
996                        "(gi=%d) failed\r\n", (int)gi);
997                 retvalue = FALSE;
998             }
999         }
1000         break;
1001     case F_DLIST:
1002 
1003         if (IS_Bioseq(sep))
1004             retvalue = SeqEntrysToDefline(sep, fd, is_na, 3);
1005         else
1006             retvalue = SeqEntrysToDefline(sep, fd, is_na, 0);
1007 
1008         if(retvalue == FALSE) {
1009             if (IS_Bioseq(sep))
1010                 retvalue = SeqEntrysToDefline(sep, fd, (Uint1) !is_na, 3);
1011             else
1012                 retvalue = SeqEntrysToDefline(sep, fd, (Uint1) !is_na, 0);
1013         }
1014 
1015         break;
1016 
1017     case F_ASN1:
1018         aip = AsnIoNew(ASNIO_TEXT_OUT, fd, NULL, NULL, NULL);
1019         retvalue = SeqEntryAsnWrite(sep, aip, NULL);
1020         AsnIoClose(aip);
1021         break;
1022     case F_ASN1_GENB:
1023         retvalue = SeqEntryAsnWrite(sep, bgbp->aip, bgbp->atp);
1024         break;
1025     default:
1026         return FALSE;
1027     }
1028 
1029 
1030     if(single_entry) {
1031         SeqIdFree(sip);
1032         MemFree(sep);
1033     }
1034 
1035     SeqEntryFree(sep_all);
1036 
1037     return retvalue;
1038 }
1039 
1040 
GetGisFromOrg(CharPtr org,Int4Ptr GiNum,Int4 seqtype)1041 static ByteStorePtr GetGisFromOrg(CharPtr org, Int4Ptr GiNum, Int4 seqtype)
1042 {
1043   ByteStorePtr bsp;
1044   CharPtr str;
1045 
1046   str = (CharPtr) MemNew(StringLen(org)+30);
1047   sprintf(str, "\"%s\"[ORGN]", org);
1048 
1049   if((bsp = EntrezTLEvalXString(str, (Int2) seqtype, -1, NULL, NULL)) == NULL) {
1050     ErrLogPrintf("Failure to parse input string");
1051     return NULL;
1052   }
1053   BSSeek(bsp, 0L, 0);
1054   *GiNum = BSLen(bsp)/sizeof(DocUid);
1055   return bsp;
1056 }
1057 
GetAccList(BatchParamPtr batchP,Int4Ptr TotalItems)1058 static BatchAccListPtr GetAccList(BatchParamPtr batchP,
1059                                   Int4Ptr TotalItems)
1060 {
1061     Char TmpBuff[16];
1062     Int4 i, j, k;
1063     Int4 FileLen = 0;
1064     BatchAccListPtr AccList = NULL;
1065     BatchAccListPtr AccListTmp, AccListLast;
1066     Int4 NumNotValid = 0;
1067     Int4 NumGis;
1068     Int4Ptr giptr;
1069     CharPtr file;
1070     Int2 seqtype;
1071 
1072     if (!batchP  ||  (file = batchP->file) == NULL  ||  !file[0]) {
1073       *TotalItems = 0;
1074       return NULL;
1075     }
1076     seqtype = batchP->sequence;
1077 
1078     FileLen = StringLen(file);
1079 
1080     for(i = 0; i < FileLen; i++) {
1081 
1082         if(isspace(file[i]) || file[i] == ',') /* Rolling spaces */
1083             continue;
1084 
1085         /* This is defence from badly formatted requests */
1086 
1087         if(!batchP->CommandLine && NumNotValid > 10) {
1088             printf("**** ERROR: Too many invalid Gis/Accessions, "
1089                    "parsing aborted\n");
1090             *TotalItems = 0;
1091             return NULL;
1092         }
1093 
1094         /* Rolling spaces */
1095 
1096         j= 0;
1097         while (!isspace(file[i]) && j < 10  && i < FileLen) {
1098             TmpBuff[j] = file[i];
1099             j++; i++;
1100             if(file[i] == ',')  /* Comma is valid delimiter */
1101                 break;
1102         }
1103         TmpBuff[j] = NULLB;
1104 
1105 
1106         /* Ignore strings like ">Protein" */
1107 
1108         if(j > 0 && TmpBuff[0] == '>' && IS_ALPHA(TmpBuff[1]))
1109             continue;
1110 
1111         /* Is gi/accession too long ??? */
1112 
1113         if(j == 10) {
1114             printf("**** WARNING: Gi/Accession \"%s\" is too long\r\n",
1115                    TmpBuff);
1116             NumNotValid++;
1117 
1118             while(!isspace(file[i]) ||
1119                   file[i] == ',' ||
1120                   file[i] == NULLB) /* Rolling until spaces */
1121                 i++;
1122             continue;  /* Next may be valid ... who knows...?? */
1123         }
1124 
1125         /* Now validating accession/gi */
1126 
1127         for(k =0; k < j; k++) {
1128             if(!isdigit(TmpBuff[k])) {
1129                 break;
1130             }
1131         }
1132         if(k != j) {
1133             if(!IS_ntdb_accession(TmpBuff) && !IS_protdb_accession(TmpBuff)) {
1134                 printf("**** WARNING: Gi/Accession \"%s\" is not valid\r\n",
1135                        TmpBuff);
1136                 NumNotValid++;
1137                 continue;
1138             }
1139         }
1140 
1141         /* If this is valid Accession check and tranfer it to gi */
1142 
1143         giptr = NULL;
1144         if((NumGis = AccessionToGi(TmpBuff, &giptr, seqtype)) == 0) {
1145             printf("**** WARNING: Gi/Accession %s is not found "
1146                    "in database----\r\n",
1147                    TmpBuff);
1148             NumNotValid++;
1149             continue;
1150         } else {
1151             for (j = 0; j < NumGis; j++) {
1152                 /* It we come here - we got valid text ID */
1153 
1154                 if(AccList == NULL) { /* first element */
1155                     AccList = (BatchAccListPtr) MemNew(sizeof(BatchAccList));
1156                     AccListTmp = AccList;
1157                     AccListTmp->acc = StringSave(TmpBuff);
1158                     AccListTmp->gi = giptr[j];
1159                     AccListTmp->next = NULL;
1160                     AccListLast=AccListTmp;
1161                     *TotalItems = *TotalItems +1;
1162                 } else {
1163                     AccListTmp = (BatchAccListPtr)
1164                         MemNew(sizeof(BatchAccList));
1165                     AccListLast->next = AccListTmp;
1166                     AccListTmp->acc = StringSave(TmpBuff);
1167                     AccListTmp->gi = giptr[j];
1168                     AccListTmp->next = NULL;
1169                     AccListLast = AccListTmp;
1170                     *TotalItems = *TotalItems +1;
1171                 }
1172             }
1173             MemFree(giptr);
1174         }
1175     }
1176     if(NumNotValid) {
1177         printf("**** %d invalid Gi%s/Accession%s present in Entrez-batch "
1178                "request\r\n",
1179                (int)NumNotValid,
1180                NumNotValid == 1 ? "" : "s",
1181                NumNotValid == 1 ? "" : "s"
1182                );
1183     }
1184     return AccList;
1185 }
/* BatchHead: placeholder that ignores both arguments and does nothing. */
void BatchHead(VoidPtr pointer, FILE *fd)
{
  /* intentionally empty */
}
/* BatchTail: placeholder that ignores both arguments and does nothing. */
void BatchTail(VoidPtr pointer, FILE *fd)
{
  /* intentionally empty */
}
1194 
1195