1 /* insdseqget.c
2 * ===========================================================================
3 *
4 * PUBLIC DOMAIN NOTICE
5 * National Center for Biotechnology Information (NCBI)
6 *
7 * This software/database is a "United States Government Work" under the
8 * terms of the United States Copyright Act. It was written as part of
9 * the author's official duties as a United States Government employee and
10 * thus cannot be copyrighted. This software/database is freely available
11 * to the public for use. The National Library of Medicine and the U.S.
12 * Government do not place any restriction on its use or reproduction.
13 * We would, however, appreciate having the NCBI and the author cited in
14 * any work or product based on this material
15 *
16 * Although all reasonable efforts have been taken to ensure the accuracy
17 * and reliability of the software and data, the NLM and the U.S.
18 * Government do not and cannot warrant the performance or results that
19 * may be obtained by using this software or data. The NLM and the U.S.
20 * Government disclaim all warranties, express or implied, including
21 * warranties of performance, merchantability or fitness for any particular
22 * purpose.
23 *
24 * ===========================================================================
25 *
26 * File Name: insdseqget.c
27 *
28 * Author: Jonathan Kans
29 *
30 * Version Creation Date: 11/4/02
31 *
32 * $Revision: 1.2 $
33 *
34 * File Description: Demo to fetch by accession, write INSDSet XML
35 *
36 * Modifications:
37 * --------------------------------------------------------------------------
38 * ==========================================================================
39 */
40
41 #include <ncbi.h>
42 #include <objall.h>
43 #include <objsset.h>
44 #include <objsub.h>
45 #include <objfdef.h>
46 #include <objgbseq.h>
47 #include <objinsdseq.h>
48 #include <seqport.h>
49 #include <sequtil.h>
50 #include <sqnutils.h>
51 #include <subutil.h>
52 #include <tofasta.h>
53 #include <explore.h>
54 #include <ent2api.h>
55 #include <pmfapi.h>
56 #include <asn2gnbp.h>
57
58 #define INSDSEQGET_APP_VER "1.1"
59
60 CharPtr INSDSEQGET_APPLICATION = INSDSEQGET_APP_VER;
61
/*
 * ReadALine
 *
 * Reads the next line of fp into str (capacity size) with FileGets and
 * cuts the text at the first newline or carriage return.
 *
 * Returns NULL when str or fp is NULL, when size is 0, or when FileGets
 * returns NULL; otherwise returns the FileGets result.  str is set to
 * the empty string before reading.
 */
static CharPtr ReadALine (
  CharPtr str,
  size_t size,
  FILE *fp
)

{
  CharPtr  cursor;
  CharPtr  got;

  if (str == NULL || size < 1 || fp == NULL) return NULL;

  *str = '\0';
  got = FileGets (str, size, fp);
  if (got == NULL) return NULL;

  /* terminate at the first end-of-line character, if there is one */
  for (cursor = str; *cursor != '\0'; cursor++) {
    if (*cursor == '\n' || *cursor == '\r') {
      *cursor = '\0';
      break;
    }
  }

  return got;
}
87
88 typedef struct lookforids {
89 Boolean isGED;
90 Boolean isNTorNW;
91 Boolean isNC;
92 Boolean isTPA;
93 Boolean isNuc;
94 Boolean isProt;
95 } LookForIDs, PNTR LookForIDsPtr;
96
/*
 * LookForSeqIDs
 *
 * Bioseq callback (registered through VisitBioseqsInSep by LookForGEDetc).
 * userdata points to a LookForIDs record; this function only ever sets
 * flags in it to TRUE, so the record accumulates across bioseqs.
 */
static void LookForSeqIDs (BioseqPtr bsp, Pointer userdata)

{
  LookForIDsPtr  found = (LookForIDsPtr) userdata;
  SeqIdPtr       id;
  TextSeqIdPtr   text_id;

  /* molecule class of this bioseq */
  if (ISA_na (bsp->mol)) found->isNuc = TRUE;
  if (ISA_aa (bsp->mol)) found->isProt = TRUE;

  /* classify every Seq-id attached to the bioseq */
  for (id = bsp->id; id != NULL; id = id->next) {
    if (id->choice == SEQID_GENBANK ||
        id->choice == SEQID_EMBL ||
        id->choice == SEQID_DDBJ) {
      found->isGED = TRUE;
    } else if (id->choice == SEQID_TPG ||
               id->choice == SEQID_TPE ||
               id->choice == SEQID_TPD) {
      found->isTPA = TRUE;
    } else if (id->choice == SEQID_OTHER) {
      text_id = (TextSeqIdPtr) id->data.ptrvalue;
      if (text_id == NULL) continue;
      /* distinguish by accession prefix */
      if (StringNCmp (text_id->accession, "NC_", 3) == 0) {
        found->isNC = TRUE;
      } else if (StringNCmp (text_id->accession, "NT_", 3) == 0 ||
                 StringNCmp (text_id->accession, "NW_", 3) == 0) {
        found->isNTorNW = TRUE;
      }
    }
  }
}
140
/*
 * LookForGEDetc
 *
 * Runs LookForSeqIDs over every bioseq in topsep and copies the
 * resulting flags to the six caller-supplied Booleans.  Every output
 * is always written; a flag nothing set comes back as 0.
 */
static void LookForGEDetc (
  SeqEntryPtr topsep,
  BoolPtr isGED,
  BoolPtr isNTorNW,
  BoolPtr isNC,
  BoolPtr isTPA,
  BoolPtr isNuc,
  BoolPtr isProt
)

{
  LookForIDs  seen;

  /* start from all-clear; the callback only ever raises flags */
  MemSet ((Pointer) &seen, 0, sizeof (seen));
  VisitBioseqsInSep (topsep, (Pointer) &seen, LookForSeqIDs);

  *isGED    = seen.isGED;
  *isNTorNW = seen.isNTorNW;
  *isNC     = seen.isNC;
  *isTPA    = seen.isTPA;
  *isNuc    = seen.isNuc;
  *isProt   = seen.isProt;
}
163
/*
 * DoSeqEntryToGnbk
 *
 * Formats sep through SeqEntryToGnbk in the requested fmt, passing
 * extra along.  Nothing is written when fmt is GENBANK_FMT and the
 * entry holds no nucleotide bioseq, or when fmt is GENPEPT_FMT and it
 * holds no protein bioseq.
 *
 * The feature-related flag depends on the Seq-ids found in the entry:
 * NT_/NW_ or TPA records get ONLY_NEAR_FEATURES, otherwise NC_ records
 * get NEAR_FEATURES_SUPPRESS.
 */
static void DoSeqEntryToGnbk (
  SeqEntryPtr sep,
  FmtType fmt,
  XtraPtr extra
)

{
  CstType  custom = SHOW_TRANCRIPTION | SHOW_PEPTIDE;
  FlgType  flags = SHOW_FAR_TRANSLATION | SHOW_CONTIG_AND_SEQ;
  LckType  locks = LOOKUP_FAR_COMPONENTS | LOOKUP_FAR_LOCATIONS | LOOKUP_FAR_PRODUCTS;
  Boolean  has_ged;
  Boolean  has_nc;
  Boolean  has_nt_or_nw;
  Boolean  has_nuc;
  Boolean  has_prot;
  Boolean  has_tpa;

  LookForGEDetc (sep, &has_ged, &has_nt_or_nw, &has_nc, &has_tpa, &has_nuc, &has_prot);

  /* skip entries that have nothing to show in the requested format */
  if ((fmt == GENBANK_FMT && ! has_nuc) ||
      (fmt == GENPEPT_FMT && ! has_prot)) {
    return;
  }

  if (has_nt_or_nw || has_tpa) {
    flags |= ONLY_NEAR_FEATURES;
  } else if (has_nc) {
    flags |= NEAR_FEATURES_SUPPRESS;
  }

  SeqEntryToGnbk (sep, NULL, fmt, ENTREZ_MODE, SEGMENT_STYLE,
                  flags, locks, custom, extra, NULL);
}
195
DoQuery(FILE * fp,FILE * dfp,XtraPtr extra,Boolean get_var,Boolean do_nuc,Boolean do_prot)196 static void DoQuery (
197 FILE *fp,
198 FILE *dfp,
199 XtraPtr extra,
200 Boolean get_var,
201 Boolean do_nuc,
202 Boolean do_prot
203 )
204
205 {
206 Entrez2BooleanReplyPtr e2br;
207 Entrez2IdListPtr e2lp;
208 Entrez2RequestPtr e2rq;
209 Entrez2ReplyPtr e2ry;
210 Int4 flags = 0;
211 Int4 i;
212 Char line [256];
213 E2ReplyPtr reply;
214 SeqEntryPtr sep;
215 CharPtr str;
216 Uint4 uid;
217
218 if (get_var) {
219 flags = 1;
220 }
221
222 e2rq = EntrezCreateBooleanRequest (TRUE, FALSE, "Nucleotide", NULL, 0, 0, NULL, 0, 0);
223 if (e2rq == NULL) return;
224
225 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
226 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
227
228 str = ReadALine (line, sizeof (line), fp);
229 if (! StringHasNoText (str)) {
230 EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
231 }
232
233 while (str != NULL) {
234 if (! StringHasNoText (str)) {
235 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
236 EntrezAddToBooleanRequest (e2rq, NULL, 0, "ACCN", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
237 }
238 str = ReadALine (line, sizeof (line), fp);
239 }
240
241 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
242 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_AND, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
243 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_LEFT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
244
245 str = ReadALine (line, sizeof (line), dfp);
246 if (! StringHasNoText (str)) {
247 EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
248 }
249
250 while (str != NULL) {
251 if (! StringHasNoText (str)) {
252 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_OR, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
253 EntrezAddToBooleanRequest (e2rq, NULL, 0, "MDAT", str, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
254 }
255 str = ReadALine (line, sizeof (line), dfp);
256 }
257
258 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
259 EntrezAddToBooleanRequest (e2rq, NULL, ENTREZ_OP_RIGHT_PAREN, NULL, NULL, NULL, 0, 0, NULL, NULL, TRUE, TRUE);
260
261 e2ry = EntrezSynchronousQuery (e2rq);
262 e2rq = Entrez2RequestFree (e2rq);
263
264 if (e2ry == NULL) return;
265 reply = e2ry->reply;
266 if (reply == NULL || reply->choice != E2Reply_eval_boolean) return;
267 e2br = EntrezExtractBooleanReply (e2ry);
268 if (e2br == NULL) return;
269
270 e2lp = e2br->uids;
271 if (e2lp != NULL) {
272 BSSeek (e2lp->uids, 0, SEEK_SET);
273 for (i = 0; i < e2lp->num; i++) {
274 uid = Nlm_BSGetUint4 (e2lp->uids);
275 if (uid < 1) continue;
276
277 sep = PubSeqSynchronousQuery (uid, 0, flags);
278 if (sep == NULL) continue;
279
280 if (do_nuc) {
281 DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
282 }
283 if (do_prot) {
284 DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
285 }
286
287 SeqEntryFree (sep);
288 }
289 }
290
291 Entrez2BooleanReplyFree (e2br);
292 }
293
/*
 * ProcessAccession
 *
 * Fetches one record and writes it through DoSeqEntryToGnbk in the
 * nucleotide and/or protein format selected by do_nuc / do_prot.
 *
 * accn      either an all-digit gi number or text handed to
 *           SeqIdFromAccessionDotVersion
 * extra     passed through to DoSeqEntryToGnbk
 * only_new  enables the skip tests described below
 * get_var   when TRUE the record is fetched with flags = 1
 *
 * only_new skip tests:
 *   - gi input: the gi's accession.version is looked up, the version is
 *     stripped, and the bare accession is resolved back to a gi; when
 *     that is the same gi the record is skipped.
 *   - accession input: the resolved gi is turned back into
 *     accession.version text; when it equals accn (ignoring case) the
 *     record is skipped.
 *
 * Returns silently when accn is NULL, cannot be resolved to a positive
 * gi that fits in an Int4, or the fetch yields nothing.
 */
static void ProcessAccession (
  CharPtr accn,
  XtraPtr extra,
  Boolean only_new,
  Boolean get_var,
  Boolean do_nuc,
  Boolean do_prot
)

{
  Char         ch;
  Int4         flags = 0;
  Int4         gi = 0;
  Char         id [41];
  Boolean      is_numeric = TRUE;
  Int4         newgi = 0;
  CharPtr      ptr;
  SeqEntryPtr  sep;
  SeqIdPtr     sip;
  Char         tmp [41];
  long         val;

  if (accn == NULL) return;

  /* an input made only of digits is taken to be a gi number */
  for (ptr = accn; *ptr != '\0'; ptr++) {
    ch = *ptr;
    if (! IS_DIGIT (ch)) {
      is_numeric = FALSE;
      break;
    }
  }

  if (is_numeric) {
    if (sscanf (accn, "%ld", &val) != 1) return;
    gi = (Int4) val;
    /* reject non-positive values and values the narrowing cast changed */
    if (gi < 1 || (long) gi != val) return;
    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, tmp, PRINTID_TEXTID_ACC_VER, sizeof (tmp));
        SeqIdFree (sip);
        ptr = StringChr (tmp, '.');
        if (ptr != NULL) {
          /* drop the version, then see which gi the bare accession maps to */
          *ptr = '\0';
          sip = SeqIdFromAccessionDotVersion (tmp);
          newgi = GetGIForSeqId (sip);
          SeqIdFree (sip);
          if (newgi == gi) return;
        }
      }
    }
  } else {
    sip = SeqIdFromAccessionDotVersion (accn);
    gi = GetGIForSeqId (sip);
    SeqIdFree (sip);
    /* unresolved accession: stop before doing any further lookup */
    if (gi < 1) return;
    if (only_new) {
      sip = GetSeqIdForGI (gi);
      if (sip != NULL) {
        SeqIdWrite (sip, id, PRINTID_TEXTID_ACC_VER, sizeof (id));
        SeqIdFree (sip);
        if (StringICmp (accn, id) == 0) return;
      }
    }
  }

  if (get_var) {
    flags = 1;
  }
  sep = PubSeqSynchronousQuery (gi, 0, flags);
  if (sep == NULL) return;

  if (do_nuc) {
    DoSeqEntryToGnbk (sep, GENBANK_FMT, extra);
  }
  if (do_prot) {
    DoSeqEntryToGnbk (sep, GENPEPT_FMT, extra);
  }

  SeqEntryFree (sep);
}
376
377 #define i_argInputFile 0
378 #define d_argDateFile 1
379 #define o_argOutputFile 2
380 #define n_argNewRecords 3
381 #define v_argVariations 4
382 #define m_argMolecule 5
383
384 Args myargs [] = {
385 {"Sequence File Name", "stdin", NULL, NULL,
386 FALSE, 'i', ARG_FILE_IN, 0.0, 0, NULL},
387 {"Date List", NULL, NULL, NULL,
388 TRUE, 'd', ARG_FILE_IN, 0.0, 0, NULL},
389 {"Output File Name", "stdout", NULL, NULL,
390 FALSE, 'o', ARG_FILE_OUT, 0.0, 0, NULL},
391 {"New Records Only", "F", NULL, NULL,
392 TRUE, 'n', ARG_BOOLEAN, 0.0, 0, NULL},
393 {"Fetch SNP Variations", "F", NULL, NULL,
394 TRUE, 'v', ARG_BOOLEAN, 0.0, 0, NULL},
395 {"Molecule (n Nucleotide, p Protein, b Both)", "n", NULL, NULL,
396 FALSE, 'm', ARG_STRING, 0.0, 0, NULL},
397 };
398
399 NLM_EXTERN void AsnPrintNewLine PROTO((AsnIoPtr aip));
400
Main(void)401 Int2 Main (void)
402
403 {
404 AsnIoPtr aip;
405 Char app [64];
406 AsnTypePtr atp;
407 FILE *dfp = NULL;
408 Boolean do_nuc = FALSE;
409 Boolean do_prot = FALSE;
410 XtraPtr extra;
411 FILE *fp;
412 GBSeq gbsq;
413 GBSet gbst;
414 Boolean get_var;
415 Char line [256];
416 Boolean only_new;
417 CharPtr str;
418 Char xmlbuf [128];
419 XtraBlock xtra;
420
421 ErrSetFatalLevel (SEV_MAX);
422 ErrClearOptFlags (EO_SHOW_USERSTR);
423 UseLocalAsnloadDataAndErrMsg ();
424 ErrPathReset ();
425
426 if (! AllObjLoad ()) {
427 Message (MSG_FATAL, "AllObjLoad failed");
428 return 1;
429 }
430 if (! SubmitAsnLoad ()) {
431 Message (MSG_FATAL, "SubmitAsnLoad failed");
432 return 1;
433 }
434 if (! SeqCodeSetLoad ()) {
435 Message (MSG_FATAL, "SeqCodeSetLoad failed");
436 return 1;
437 }
438 if (! GeneticCodeTableLoad ()) {
439 Message (MSG_FATAL, "GeneticCodeTableLoad failed");
440 return 1;
441 }
442 if (! objgbseqAsnLoad ()) {
443 Message (MSG_POSTERR, "objgbseqAsnLoad failed");
444 return 1;
445 }
446
447 sprintf (app, "insdseqget %s", INSDSEQGET_APPLICATION);
448 if (! GetArgs (app, sizeof (myargs) / sizeof (Args), myargs)) {
449 return 0;
450 }
451
452 fp = FileOpen (myargs [i_argInputFile].strvalue, "r");
453 if (fp == NULL) {
454 return 1;
455 }
456
457 if (! StringHasNoText (myargs [d_argDateFile].strvalue)) {
458 dfp = FileOpen (myargs [d_argDateFile].strvalue, "r");
459 if (dfp == NULL) {
460 return 1;
461 }
462 }
463
464 if (GetAppParam ("NCBI", "SETTINGS", "XMLPREFIX", NULL, xmlbuf, sizeof (xmlbuf))) {
465 AsnSetXMLmodulePrefix (StringSave (xmlbuf));
466 }
467
468 MemSet ((Pointer) &xtra, 0, sizeof (XtraBlock));
469 MemSet ((Pointer) &gbsq, 0, sizeof (GBSeq));
470 xtra.gbseq = &gbsq;
471 aip = AsnIoOpen (myargs [o_argOutputFile].strvalue, "wx");
472
473 if (aip == NULL) {
474 Message (MSG_POSTERR, "AsnIoOpen failed");
475 FileClose (fp);
476 return 1;
477 }
478
479 only_new = (Boolean) myargs [n_argNewRecords].intvalue;
480 get_var = (Boolean) myargs [v_argVariations].intvalue;
481
482 str = myargs [m_argMolecule].strvalue;
483 if (StringICmp (str, "n") == 0) {
484 do_nuc = TRUE;
485 } else if (StringICmp (str, "p") == 0) {
486 do_prot = TRUE;
487 } else if (StringICmp (str, "b") == 0) {
488 do_nuc = TRUE;
489 do_prot = TRUE;
490 } else {
491 do_nuc = TRUE;
492 }
493
494 PubSeqFetchEnable ();
495
496 xtra.aip = aip;
497 atp = AsnLinkType (NULL, AsnFind ("INSDSet"));
498 xtra.atp = AsnLinkType (NULL, AsnFind ("INSDSet.E"));
499 if (atp == NULL || xtra.atp == NULL) {
500 Message (MSG_POSTERR, "AsnLinkType or AsnFind failed");
501 return 1;
502 }
503 extra = &xtra;
504 MemSet ((Pointer) &gbst, 0, sizeof (GBSet));
505 AsnOpenStruct (aip, atp, (Pointer) &gbst);
506
507 if (dfp != NULL) {
508 DoQuery (fp, dfp, extra, get_var, do_nuc, do_prot);
509 } else {
510 str = ReadALine (line, sizeof (line), fp);
511 while (str != NULL) {
512 if (! StringHasNoText (str)) {
513 ProcessAccession (str, extra, only_new, get_var, do_nuc, do_prot);
514 }
515 str = ReadALine (line, sizeof (line), fp);
516 }
517 }
518
519 AsnCloseStruct (aip, atp, NULL);
520 AsnPrintNewLine (aip);
521 AsnIoClose (aip);
522
523 FileClose (dfp);
524 FileClose (fp);
525
526 PubSeqFetchDisable ();
527
528 return 0;
529 }
530
531