1 /* @source rebaseextract application
2 **
3 ** Extracts restriction enzyme information from REBASE
4 ** @author Copyright (C) Alan Bleasby (ableasby@hgmp.mrc.ac.uk)
5 ** @@
6 **
7 ** This program is free software; you can redistribute it and/or
8 ** modify it under the terms of the GNU General Public License
9 ** as published by the Free Software Foundation; either version 2
10 ** of the License, or (at your option) any later version.
11 **
12 ** This program is distributed in the hope that it will be useful,
13 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
14 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 ** GNU General Public License for more details.
16 **
17 ** You should have received a copy of the GNU General Public License
18 ** along with this program; if not, write to the Free Software
19 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
20 ******************************************************************************/
21
22 #include "emboss.h"
23 #include <math.h>
24 #include <stdlib.h>
25 #include <string.h>
26 #include <ctype.h>
27 #include <limits.h>
28
29
30 #define DATANAME "REBASE/embossre.enz"
31 #define DATANAME2 "REBASE/embossre.ref"
32 #define DATANAME3 "REBASE/embossre.sup"
33 #define DATANAME4 "REBASE/embossre.equ"
34 #define EQUGUESS 5000
35 #define SUPGUESS 10000
36
37
38
39 static void rebaseextract_process_pattern(const AjPStr pattern,
40 const AjPStr code,
41 AjPFile outf, AjBool hassup);
42 static void rebaseextract_printEnzHeader(AjPFile outf);
43 static void rebaseextract_printRefHeader(AjPFile outf);
44 static void rebaseextract_printSuppHeader(AjPFile outf);
45
46
47
48
49 /* @prog rebaseextract ********************************************************
50 **
51 ** Extract data from REBASE
52 **
53 ******************************************************************************/
54
main(int argc,char ** argv)55 int main(int argc, char **argv)
56 {
57 AjPFile inf = NULL;
58 AjPFile infp = NULL;
59 AjPFile outf = NULL;
60 AjPFile outf2 = NULL;
61 AjPFile outf3 = NULL;
62
63 AjPStr line;
64 AjPStr code;
65 AjPStr pattern;
66 AjPStr isoschiz;
67 AjPStr meth;
68 AjPStr tit;
69 AjPStr sou;
70 AjPStr comm;
71 AjPStr pfname;
72 AjBool isrefm = ajFalse;
73 AjBool isref = ajFalse;
74 AjBool hassup;
75
76 ajint count;
77 ajlong pos;
78 ajint i;
79
80 AjBool doequ;
81 AjPFile oute = NULL;
82 AjPStr isostr = NULL;
83 AjPTable ptable = NULL;
84 AjPStr key = NULL;
85 AjPStr value = NULL;
86 AjPStrTok handle = NULL;
87 AjPStr token = NULL;
88 AjPStr line2 = NULL;
89 AjPList equ1 = NULL;
90 AjPList equ2 = NULL;
91 AjPStr equstr1 = NULL;
92 AjPStr equstr2 = NULL;
93 AjPStr hstr1 = NULL;
94 AjPStr hstr2 = NULL;
95 const AjPStr htest = NULL;
96
97 AjBool isproto = ajFalse;
98 char c;
99 const char *sptr = NULL;
100
101 AjPTable hassuptable = NULL;
102
103
104 embInit("rebaseextract",argc,argv);
105
106 inf = ajAcdGetInfile("infile");
107 infp = ajAcdGetInfile("protofile");
108 doequ = ajAcdGetBoolean("equivalences");
109
110 equ1 = ajListNew();
111 equ2 = ajListNew();
112
113 hassuptable = ajTablestrNew(SUPGUESS);
114
115
116 pfname = ajStrNewC(DATANAME);
117 outf = ajDatafileNewOutNameS(pfname);
118 rebaseextract_printEnzHeader(outf);
119
120 ajStrAssignC(&pfname,DATANAME2);
121 outf2 = ajDatafileNewOutNameS(pfname);
122 rebaseextract_printRefHeader(outf2);
123
124 ajStrAssignC(&pfname,DATANAME3);
125 outf3 = ajDatafileNewOutNameS(pfname);
126 rebaseextract_printSuppHeader(outf3);
127
128 if(doequ)
129 {
130 ajStrAssignC(&pfname,DATANAME4);
131 oute = ajDatafileNewOutNameS(pfname);
132 ptable = ajTablestrNew(EQUGUESS);
133 isostr = ajStrNew();
134 }
135 ajStrDel(&pfname);
136
137 line = ajStrNew();
138 line2 = ajStrNew();
139 code = ajStrNew();
140 pattern = ajStrNew();
141 isoschiz = ajStrNew();
142 meth = ajStrNew();
143 tit = ajStrNew();
144 sou = ajStrNew();
145 comm = ajStrNew();
146 token = ajStrNew();
147
148 /*
149 * Extract Supplier information
150 */
151 while(ajReadlineTrim(inf,&line))
152 {
153 if(ajStrFindC(line,"withrefm.")>=0)
154 isrefm = ajTrue;
155
156 if(ajStrFindC(line,"withref.")>=0)
157 isref = ajTrue;
158
159 if(strstr(ajStrGetPtr(line),"REBASE codes"))
160 break;
161 }
162
163
164 while(ajReadlineTrim(infp,&line))
165 {
166 if(ajStrFindC(line,"proto.")>=0)
167 isproto = ajTrue;
168
169
170 if(strstr(ajStrGetPtr(line),"Rich Roberts"))
171 break;
172 }
173
174 if(!isrefm)
175 {
176 if(isref)
177 ajFatal("WITHREF file specified by mistake. Use WITHREFM instead");
178 else
179 ajFatal("Invalid withrefm file");
180 }
181
182
183 if(!isproto)
184 ajFatal("Invalid PROTO file specified");
185
186
187
188 while(doequ && ajReadlineTrim(infp,&line))
189 {
190 if(!ajStrGetLen(line))
191 continue;
192 sptr = ajStrGetPtr(line);
193 c = *sptr;
194 if(c>='A' && c<='Z')
195 {
196 while(*sptr!=' ')
197 ++sptr;
198 while(*sptr==' ')
199 ++sptr;
200
201 key = ajStrNew();
202 value = ajStrNewC(sptr);
203 ajStrRemoveWhite(&value);
204 ajFmtScanS(line,"%S",&key);
205 ajTablePut(ptable,(void *)key, (void *)value);
206 }
207 }
208
209 if(!ajReadlineTrim(inf,&line))
210 ajFatal("No Supplier Information Found");
211
212 if(!ajReadlineTrim(inf,&line))
213 ajFatal("Unexpected EOF");
214
215 while(ajStrGetLen(line))
216 {
217 ajStrRemoveWhiteExcess(&line);
218 ajFmtPrintF(outf3,"%s\n",ajStrGetPtr(line));
219 if(!ajReadlineTrim(inf,&line))
220 ajFatal("Unexpected EOF");
221 }
222 ajFileClose(&outf3);
223
224
225
226 while(ajReadlineTrim(inf,&line))
227 {
228 /* Get RE */
229 if(!ajStrPrefixC(line,"<1>"))
230 continue;
231
232 ajStrAssignC(&code,ajStrGetPtr(line)+3);
233
234 /* Get isoschizomers */
235 if(!ajReadlineTrim(inf,&line2))
236 ajFatal("Unexpected EOF");
237
238 if(ajStrGetLen(line2)>3)
239 ajStrAssignC(&isoschiz,ajStrGetPtr(line2)+3);
240 else
241 ajStrAssignC(&isoschiz,"");
242
243
244 /* Get recognition sequence */
245 if(!ajReadlineTrim(inf,&line))
246 ajFatal("Unexpected EOF");
247
248 if(ajStrGetLen(line)>3)
249 {
250 ajStrAssignC(&pattern,ajStrGetPtr(line)+3);
251
252 if(doequ && ajStrGetLen(line2)>3)
253 {
254 ajStrAssignS(&isostr,isoschiz);
255 handle = ajStrTokenNewC(isostr,"\t\n>,");
256 ajStrTokenNextParse(handle, &token);
257
258 if((value=ajTableFetchmodS(ptable, token)))
259 {
260 if(ajStrMatchS(value,pattern))
261 {
262 equstr1 = ajStrNew();
263 equstr2 = ajStrNew();
264 ajStrAssignS(&equstr1,code);
265 ajStrAssignS(&equstr2, token);
266 ajListPushAppend(equ1,(void *) equstr1);
267 ajListPushAppend(equ2,(void *) equstr2);
268 }
269 }
270 ajStrTokenDel(&handle);
271 }
272 }
273 else
274 ajStrAssignC(&pattern,"");
275
276 /* Get methylation */
277 if(!ajReadlineTrim(inf,&line))
278 ajFatal("Unexpected EOF");
279
280 if(ajStrGetLen(line)>3)
281 ajStrAssignC(&meth,ajStrGetPtr(line)+3);
282 else
283 ajStrAssignC(&meth,"");
284
285 /* Get title */
286 if(!ajReadlineTrim(inf,&line))
287 ajFatal("Unexpected EOF");
288
289 if(ajStrGetLen(line)>3)
290 ajStrAssignC(&tit,ajStrGetPtr(line)+3);
291 else
292 ajStrAssignC(&tit,"");
293
294 /* Get source */
295 if(!ajReadlineTrim(inf,&line))
296 ajFatal("Unexpected EOF");
297
298 if(ajStrGetLen(line)>3)
299 ajStrAssignC(&sou,ajStrGetPtr(line)+3);
300 else
301 ajStrAssignC(&sou,"");
302
303 /* Get commercial supplier */
304 if(!ajReadlineTrim(inf,&line))
305 ajFatal("Unexpected EOF");
306
307 hstr1 = ajStrNew();
308 hstr2 = ajStrNew();
309
310 hassup=ajFalse;
311 if(ajStrGetLen(line)>3)
312 {
313 hassup = ajTrue;
314 ajStrAssignC(&comm,ajStrGetPtr(line)+3);
315 ajStrAssignC(&hstr2,"Y");
316 }
317 else
318 {
319 ajStrAssignC(&comm,"");
320 ajStrAssignC(&hstr2,"N");
321 }
322
323 ajStrAssignS(&hstr1,code);
324 ajTablePut(hassuptable,(void *)hstr1, (void *)hstr2);
325
326 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(code));
327 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(tit));
328 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(isoschiz));
329 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(meth));
330 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(sou));
331 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(comm));
332
333 /* Get references */
334 count = -1;
335 pos = ajFileResetPos(inf);
336 while(ajStrGetLen(line))
337 {
338 if(!ajReadlineTrim(inf,&line))
339 break;
340 ++count;
341 }
342 ajFileSeek(inf,pos,0);
343
344
345 if(!ajReadlineTrim(inf,&line))
346 ajFatal("Unexpected EOF");
347
348 if(ajStrGetLen(line)==3)
349 ajFmtPrintF(outf2,"0\n");
350 else
351 {
352 ajFmtPrintF(outf2,"%d\n%s\n",count,ajStrGetPtr(line)+3);
353 for(i=1;i<count;++i)
354 {
355 if(!ajReadlineTrim(inf,&line))
356 ajFatal("Unexpected EOF");
357 ajFmtPrintF(outf2,"%s\n",ajStrGetPtr(line));
358 }
359 }
360 ajFmtPrintF(outf2,"//\n");
361
362
363 rebaseextract_process_pattern(pattern,code,outf,hassup);
364
365 }
366
367
368 if(doequ)
369 {
370 while(ajListPop(equ1,(void **)&equstr1))
371 {
372 ajListPop(equ2,(void **)&equstr2);
373
374 if(!(htest=ajTableFetchS(hassuptable, equstr2)))
375 ajFatal("Expected supplier value not found for '%S' (enzyme '%S')",
376 equstr1,equstr2);
377 if(ajStrMatchC(htest,"N"))
378 ajStrAppendC(&equstr2,"*");
379
380 ajFmtPrintF(oute,"%S %S\n",equstr1,equstr2);
381 ajStrDel(&equstr1);
382 ajStrDel(&equstr2);
383 }
384
385 ajStrDel(&isostr);
386 ajFileClose(&oute);
387 ajTablestrFree(&ptable);
388 }
389
390 ajTablestrFree(&hassuptable);
391 ajFileClose(&inf);
392 ajFileClose(&infp);
393 ajFileClose(&outf);
394 ajFileClose(&outf2);
395
396
397 ajStrDel(&line);
398 ajStrDel(&line2);
399 ajStrDel(&token);
400 ajStrDel(&isoschiz);
401 ajStrDel(&tit);
402 ajStrDel(&meth);
403 ajStrDel(&sou);
404 ajStrDel(&comm);
405 ajStrDel(&pattern);
406 ajStrDel(&code);
407
408 ajListFree(&equ1);
409 ajListFree(&equ2);
410
411 embExit();
412
413 return 0;
414 }
415
416
417
418
419 /* @funcstatic rebaseextract_process_pattern **********************************
420 **
421 ** Convert rebase pattern into emboss pattern
422 **
423 ** @param [r] pattern [const AjPStr] rebase recognition sequence
424 ** @param [r] code [const AjPStr] re name
425 ** @param [u] outf [AjPFile] outfile
426 ** @param [r] hassup [AjBool] has a supplier
427 ** @@
428 ******************************************************************************/
429
rebaseextract_process_pattern(const AjPStr pattern,const AjPStr code,AjPFile outf,AjBool hassup)430 static void rebaseextract_process_pattern(const AjPStr pattern,
431 const AjPStr code,
432 AjPFile outf,
433 AjBool hassup)
434 {
435 AjPStr temp = NULL;
436 AjPStr ppat = NULL;
437 AjPStrTok tokens = NULL;
438 AjPStr tmppattern = NULL;
439
440 const char *p;
441 const char *q;
442 char *r;
443 const char *t;
444
445 AjBool hascarat;
446
447 ajint cut1;
448 ajint cut2;
449 ajint cut3;
450 ajint cut4;
451 ajint len;
452 ajint ncuts;
453 ajint nc;
454 ajint i;
455 AjBool blunt = ajFalse;
456
457 tmppattern = ajStrNewS(pattern);
458 ajStrFmtUpper(&tmppattern);
459 temp = ajStrNew();
460 ppat = ajStrNew();
461
462 tokens=ajStrTokenNewC(tmppattern,",");
463
464 while(ajStrTokenNextParseC(tokens,",",&ppat))
465 {
466 ajFmtPrintF(outf,"%S\t",code);
467
468 ajStrAssignS(&temp,ppat);
469
470 hascarat = ajFalse;
471 p = ajStrGetPtr(ppat);
472
473 if(*p=='?')
474 {
475 ajFmtPrintF(outf,"?\t0\t0\t0\t0\t0\t0\t0\n");
476 continue;
477 }
478
479
480 t = p;
481 if(*p=='(')
482 {
483 sscanf(p+1,"%d/%d",&cut1,&cut2);
484 q=p+1;
485 if(!(q=strchr(q,(ajint)'(')))
486 ajFatal("Bad pattern %S in %S",code,pattern);
487 sscanf(q+1,"%d/%d",&cut3,&cut4);
488 cut1 *= -1;
489 cut2 *= -1;
490 --cut1;
491 --cut2;
492
493 if(!(p=strchr(p,(ajint)')')))
494 ajFatal("%S mismatched parentheses",code);
495
496 p = ajStrGetPtr(ppat);
497 r = ajStrGetuniquePtr(&temp);
498 while(*p)
499 {
500 if(*p>='A' && *p<='Z')
501 *r++ = *p;
502 ++p;
503 }
504 *r = '\0';
505 ajStrAssignC(&ppat,ajStrGetPtr(temp));
506 len=ajStrGetLen(ppat);
507 cut3 += len;
508 cut4 += len;
509 ncuts = 4;
510
511 if(cut1==cut2 && cut3==cut4)
512 blunt = ajTrue;
513 else
514 blunt = ajFalse;
515
516 p = t;
517 }
518 else
519 {
520 ncuts = 2;
521 cut3 = cut4 = 0;
522 if((p=strchr(p,(ajint)'(')))
523 {
524 sscanf(p+1,"%d/%d",&cut1,&cut2);
525 if(cut1==cut2)
526 blunt = ajTrue;
527 else
528 blunt = ajFalse;
529
530 p = ajStrGetPtr(ppat);
531 r = ajStrGetuniquePtr(&temp);
532 while(*p)
533 {
534 if(*p>='A' && *p<='Z')
535 *r++ = *p;
536 ++p;
537 }
538 *r = '\0';
539 ajStrAssignC(&ppat,ajStrGetPtr(temp));
540 len=ajStrGetLen(ppat);
541 cut1 += len;
542 cut2 += len;
543 if(cut1<=0)
544 --cut1;
545
546 if(cut2<=0)
547 --cut2;
548 }
549 else /* probably a carat */
550 {
551 p = ajStrGetPtr(ppat);
552 r = ajStrGetuniquePtr(&temp);
553 cut1 = 0;
554 hascarat = ajFalse;
555
556 while(*p)
557 {
558 if(*p>='A' && *p<='Z')
559 {
560 *r++ = *p;
561 if(!hascarat)
562 ++cut1;
563 }
564 if(*p=='^')
565 hascarat = ajTrue;
566
567 ++p;
568 }
569 *r = '\0';
570 ajStrAssignC(&ppat,ajStrGetPtr(temp));
571 len = ajStrGetLen(ppat);
572 if(!hascarat)
573 {
574 ncuts = 0;
575 blunt = ajFalse;
576 cut1 = 0;
577 cut2 = 0;
578 }
579 else
580 {
581 if(len==cut1*2)
582 {
583 blunt = ajTrue;
584 cut2 = cut1;
585 }
586 else if(!cut1)
587 {
588 cut1 = -1;
589 cut2 = len;
590 }
591 else
592 {
593 p = ajStrGetPtr(ppat);
594 if(p[cut1-1]=='N' && cut1==len)
595 {
596 for(i=cut1-1,nc=0;i>-1;--i)
597 if(p[i]=='N')
598 ++nc;
599 cut2 = len-cut1-nc-1;
600 }
601 else if(cut1==len)
602 cut2 = -1;
603 else
604 cut2 = len-cut1;
605 blunt = ajFalse;
606 }
607 }
608 }
609 }
610
611 /* Mark RE's with no suppliers with lc sequence */
612 if(!hassup)
613 ajStrFmtLower(&ppat);
614
615
616 if(ncuts==4)
617 ajFmtPrintF(outf,"%s\t%d\t%d\t%d\t%d\t%d\t%d\t%d\n",
618 ajStrGetPtr(ppat),len,ncuts,blunt,cut1,cut2,cut3,cut4);
619 else
620 ajFmtPrintF(outf,"%s\t%d\t%d\t%d\t%d\t%d\t0\t0\n",
621 ajStrGetPtr(ppat),
622 len,ncuts,blunt,cut1,cut2);
623 }
624
625 ajStrDel(&tmppattern);
626 ajStrDel(&temp);
627 ajStrDel(&ppat);
628 ajStrTokenDel(&tokens);
629
630 return;
631 }
632
633
634
635
636 /* @funcstatic rebaseextract_printEnzHeader ***********************************
637 **
638 ** print comments at start of embossre.enz
639 **
640 ** @param [w] outf [AjPFile] outfile
641 ** @@
642 ******************************************************************************/
643
rebaseextract_printEnzHeader(AjPFile outf)644 static void rebaseextract_printEnzHeader(AjPFile outf)
645 {
646 ajFmtPrintF(outf,"# REBASE enzyme patterns for EMBOSS\n#\n");
647 ajFmtPrintF(outf,"# Format:\n");
648 ajFmtPrintF(outf,"# name<ws>pattern<ws>len<ws>ncuts<ws>");
649 ajFmtPrintF(outf,"blunt<ws>c1<ws>c2<ws>c3<ws>c4\n");
650 ajFmtPrintF(outf,"#\n");
651 ajFmtPrintF(outf,"# Where:\n");
652 ajFmtPrintF(outf,"# name = name of enzyme\n");
653 ajFmtPrintF(outf,"# pattern = recognition site\n");
654 ajFmtPrintF(outf,"# len = length of pattern\n");
655 ajFmtPrintF(outf,"# ncuts = number of cuts made by enzyme\n");
656 ajFmtPrintF(outf,"# Zero represents unknown\n");
657 ajFmtPrintF(outf,"# blunt = true if blunt end cut, false if sticky\n");
658 ajFmtPrintF(outf,"# c1 = First 5' cut\n");
659 ajFmtPrintF(outf,"# c2 = First 3' cut\n");
660 ajFmtPrintF(outf,"# c3 = Second 5' cut\n");
661 ajFmtPrintF(outf,"# c4 = Second 3' cut\n#\n# Examples:\n");
662 ajFmtPrintF(outf,"# AAC^TGG -> 6 2 1 3 3 0 0\n");
663 ajFmtPrintF(outf,"# A^ACTGG -> 6 2 0 1 5 0 0\n");
664 ajFmtPrintF(outf,"# AACTGG -> 6 0 0 0 0 0 0\n");
665 ajFmtPrintF(outf,"# AACTGG(-5/-1) -> 6 2 0 1 5 0 0\n");
666 ajFmtPrintF(outf,"# (8/13)GACNNNNNNTCA(12/7) -> 12 4 0 -9 -14 24 19\n");
667 ajFmtPrintF(outf,"#\n");
668 ajFmtPrintF(outf,"# i.e. cuts are always to the right of the given\n");
669 ajFmtPrintF(outf,"# residue and sequences are always with reference to\n");
670 ajFmtPrintF(outf,"# the 5' strand.\n");
671 ajFmtPrintF(outf,"# Sequences are numbered ... -3 -2 -1 1 2 3 ... with\n");
672 ajFmtPrintF(outf,"# the first residue of the pattern at base number 1.\n");
673 ajFmtPrintF(outf,"#\n");
674
675 return;
676 }
677
678
679
680
681 /* @funcstatic rebaseextract_printRefHeader ***********************************
682 **
683 ** Print header to embossre.ref
684 **
685 ** @param [w] outf [AjPFile] outfile
686 ** @@
687 ******************************************************************************/
688
rebaseextract_printRefHeader(AjPFile outf)689 static void rebaseextract_printRefHeader(AjPFile outf)
690 {
691 ajFmtPrintF(outf,"# REBASE enzyme information for EMBOSS\n#\n");
692 ajFmtPrintF(outf,"# Format:\n");
693 ajFmtPrintF(outf,"# Line 1: Name of Enzyme\n");
694 ajFmtPrintF(outf,"# Line 2: Organism\n");
695 ajFmtPrintF(outf,"# Line 3: Isoschizomers\n");
696 ajFmtPrintF(outf,"# Line 4: Methylation\n");
697 ajFmtPrintF(outf,"# Line 5: Source\n");
698 ajFmtPrintF(outf,"# Line 6: Suppliers\n");
699 ajFmtPrintF(outf,"# Line 7: Number of following references\n");
700 ajFmtPrintF(outf,"# Lines 8..n: References\n");
701 ajFmtPrintF(outf,"# // (end of entry marker)\n");
702 ajFmtPrintF(outf,"#\n");
703
704 return;
705 }
706
707
708
709
710 /* @funcstatic rebaseextract_printSuppHeader **********************************
711 **
712 ** Print header to embossre.sup
713 **
714 ** @param [w] outf [AjPFile] outfile
715 ** @@
716 ******************************************************************************/
717
rebaseextract_printSuppHeader(AjPFile outf)718 static void rebaseextract_printSuppHeader(AjPFile outf)
719 {
720 ajFmtPrintF(outf,"# REBASE Supplier information for EMBOSS\n#\n");
721 ajFmtPrintF(outf,"# Format:\n");
722 ajFmtPrintF(outf,"# Code of Supplier<ws>Supplier name\n");
723 ajFmtPrintF(outf,"#\n");
724
725 return;
726 }
727