1 /* @source getorf application
2 **
3 ** Finds and extracts open reading frames (ORFs)
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24
25
26
27
28 static void getorf_WriteORF(const AjPSeq seq, ajint len, ajint seqlen,
29 AjBool sense, ajint find, ajint *orf_no,
30 ajint start, ajint pos, const AjPStr str,
31 AjPSeqout seqout, ajint around);
32
33 static void getorf_AppORF(ajint find, AjPStr *str,
34 const char *chrseq, ajint pos,
35 char aa);
36
37 static void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable,
38 ajuint minsize, ajuint maxsize, AjPSeqout seqout,
39 AjBool sense, AjBool circular, ajint find,
40 ajint *orf_no, AjBool methionine, ajint around);
41
42
43
44
45 /* types of control codon */
46 #define START 1
47 #define STOP -1
48
49 /* types of ORF to find */
50 #define P_STOP2STOP 0
51 #define P_START2STOP 1
52 #define N_STOP2STOP 2
53 #define N_START2STOP 3
54 #define AROUND_START 4
55 #define AROUND_INIT_STOP 5
56 #define AROUND_END_STOP 6
57
58
59
60
61 /* @prog getorf ***************************************************************
62 **
63 ** Finds and extracts open reading frames (ORFs)
64 **
65 ******************************************************************************/
66
main(int argc,char ** argv)67 int main(int argc, char **argv)
68 {
69
70 AjPSeqall seqall;
71 AjPSeqout seqout;
72 AjPStr tablestr;
73 ajint table;
74 ajuint minsize;
75 ajuint maxsize;
76 AjPStr findstr;
77 ajint find;
78 AjBool methionine;
79 AjBool circular;
80 AjBool reverse;
81 ajint around;
82
83 AjPSeq seq = NULL;
84 AjPTrn trnTable;
85 AjPStr sseq = NULL; /* sequence string */
86
87 /* ORF number to append to name of sequence to create unique name */
88 ajint orf_no;
89
90
91 AjBool sense; /* ajTrue = forward sense */
92 ajint len;
93
94 embInit("getorf", argc, argv);
95
96 seqout = ajAcdGetSeqoutall("outseq");
97 seqall = ajAcdGetSeqall("sequence");
98 tablestr = ajAcdGetListSingle("table");
99 minsize = ajAcdGetInt("minsize");
100 maxsize = ajAcdGetInt("maxsize");
101 findstr = ajAcdGetListSingle("find");
102 methionine = ajAcdGetBoolean("methionine");
103 circular = ajAcdGetBoolean("circular");
104 reverse = ajAcdGetBoolean("reverse");
105 around = ajAcdGetInt("flanking");
106
107
108 /* initialise the translation table */
109 ajStrToInt(tablestr, &table);
110 trnTable = ajTrnNewI(table);
111
112 /* what sort of ORF are we looking for */
113 ajStrToInt(findstr, &find);
114
115 /*
116 ** get the minimum size converted to protein length if storing
117 ** protein sequences
118 */
119 if(find == P_STOP2STOP || find == P_START2STOP || find == AROUND_START)
120 {
121 minsize /= 3;
122 maxsize /= 3;
123 }
124
125 while(ajSeqallNext(seqall, &seq))
126 {
127 ajSeqTrim(seq);
128
129 orf_no = 1; /* number of the next ORF */
130 sense = ajTrue; /* forward sense initially */
131
132 /* get the length of the sequence */
133 len = ajSeqGetLen(seq);
134
135 /*
136 ** if the sequence is circular, append it to itself to triple its
137 ** length so can deal easily with wrapped ORFs, but don't update
138 ** len
139 */
140 if(circular)
141 {
142 ajStrAssignS(&sseq, ajSeqGetSeqS(seq));
143 ajStrAppendS(&sseq, ajSeqGetSeqS(seq));
144 ajStrAppendS(&sseq, ajSeqGetSeqS(seq));
145 ajSeqAssignSeqS(seq, sseq);
146 }
147
148 /* find the ORFs */
149 getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense,
150 circular, find, &orf_no, methionine, around);
151
152 /* now reverse complement the sequence and do it again */
153 if(reverse)
154 {
155 sense = ajFalse;
156 ajSeqReverseForce(seq);
157 getorf_FindORFs(seq, len, trnTable, minsize, maxsize, seqout, sense,
158 circular, find, &orf_no, methionine,
159 around);
160 }
161 }
162
163 ajSeqoutClose(seqout);
164 ajTrnDel(&trnTable);
165
166 ajSeqallDel(&seqall);
167 ajSeqDel(&seq);
168 ajStrDel(&sseq);
169 ajSeqoutDel(&seqout);
170 ajStrDel(&tablestr);
171 ajStrDel(&findstr);
172
173 embExit();
174
175 return 0;
176 }
177
178
179
180
181 /* @funcstatic getorf_FindORFs ************************************************
182 **
183 ** finds all orfs in the current sense and writes them out
184 **
185 ** @param [r] seq [const AjPSeq] Nucleotide sequence
186 ** @param [r] len [ajint] Sequence length
187 ** @param [r] trnTable [const AjPTrn] Translation table
188 ** @param [r] minsize [ajuint] Minimum size ORF to find
189 ** @param [r] maxsize [ajuint] Maximum size ORF to find
190 ** @param [u] seqout [AjPSeqout] Sequence output object
191 ** @param [r] sense [AjBool] Forward (sense) strand if true, else reverse strand
192 ** @param [r] circular [AjBool] True if sequence is circular
193 ** @param [r] find [ajint] Find code (see main program)
194 ** @param [w] orf_no [ajint*] ORF counter updated
195 ** @param [r] methionine [AjBool] If true report all start codond as 'M'
196 ** @param [r] around [ajint] Number of bases around start/stop codons
197 ** @@
198 ******************************************************************************/
199
getorf_FindORFs(const AjPSeq seq,ajint len,const AjPTrn trnTable,ajuint minsize,ajuint maxsize,AjPSeqout seqout,AjBool sense,AjBool circular,ajint find,ajint * orf_no,AjBool methionine,ajint around)200 static void getorf_FindORFs(const AjPSeq seq, ajint len, const AjPTrn trnTable,
201 ajuint minsize, ajuint maxsize, AjPSeqout seqout,
202 AjBool sense, AjBool circular, ajint find,
203 ajint *orf_no, AjBool methionine, ajint around)
204 {
205 AjBool ORF[3]; /* true if found an ORF */
206 AjBool LASTORF[3]; /* true if hit the end of an ORF past
207 the end on the genome in this
208 frame */
209 AjBool GOTSTOP[3]; /* true if found a STOP in a circular
210 genome's frame when
211 find = P_STOP2STOP or
212 N_STOP2STOP */
213 ajint start[3]; /* possible starting position of the
214 three frames */
215 ajint pos;
216 ajint codon;
217 char aa;
218 ajint frame;
219 AjPStr newstr[3]; /* strings of the three frames of ORF
220 sequences that we are growing */
221 AjPSeq pep = NULL;
222 ajint i;
223
224 ajint seqlen;
225 const char *chrseq;
226
227 seqlen = ajSeqGetLen(seq);
228 chrseq = ajSeqGetSeqC(seq);
229
230 /* initialise the ORF sequences */
231 newstr[0] = NULL;
232 newstr[1] = NULL;
233 newstr[2] = NULL;
234
235 /*
236 ** initialise flags for found the last ORF past the end of a circular
237 ** genome
238 */
239 LASTORF[0] = ajFalse;
240 LASTORF[1] = ajFalse;
241 LASTORF[2] = ajFalse;
242
243 /* initialise flags for found at least one STOP codon in a frame */
244 GOTSTOP[0] = ajFalse;
245 GOTSTOP[1] = ajFalse;
246 GOTSTOP[2] = ajFalse;
247
248 if(circular || find == P_START2STOP || find == N_START2STOP ||
249 find == AROUND_START)
250 {
251 ORF[0] = ajFalse;
252 ORF[1] = ajFalse;
253 ORF[2] = ajFalse;
254 }
255 else
256 {
257 /*
258 ** assume already in a ORF so we get ORFs at the start of the
259 ** sequence
260 */
261 ORF[0] = ajTrue;
262 ORF[1] = ajTrue;
263 ORF[2] = ajTrue;
264 start[0] = 0;
265 start[1] = 1;
266 start[2] = 2;
267 }
268
269 for(pos=0; pos<seqlen-2; pos++)
270 {
271 codon = ajTrnCodonstrTypeC(trnTable, &chrseq[pos], &aa);
272 frame = pos % 3;
273 ajDebug("len=%d, Pos=%d, Frame=%d start/stop=%d, aa=%c '%c%c%c'\n",
274 len, pos, frame, codon, aa,
275 chrseq[pos], chrseq[pos+1], chrseq[pos+2]);
276
277 /* don't want to find extra ORFs when already been round circ */
278 if(LASTORF[frame])
279 continue;
280
281 if(find == P_STOP2STOP || find == N_STOP2STOP ||
282 find == AROUND_INIT_STOP || find == AROUND_END_STOP)
283 { /* look for stop codon to begin reporting ORF */
284 /* note that there was at least one STOP in a circular genome */
285 if(codon == STOP)
286 {
287 GOTSTOP[frame] = ajTrue;
288 }
289
290 /* write details if a STOP is hit or the end of the sequence */
291 if(codon == STOP || pos >= seqlen-5)
292 {
293
294 /*
295 ** End of the sequence? If so, append any
296 ** last codon to the sequence - otherwise, ignore the STOP
297 ** codon
298 */
299 if(codon != STOP)
300 getorf_AppORF(find, &newstr[frame], chrseq, pos,
301 aa);
302
303 /* Already have a sequence to write out? */
304 if(ORF[frame])
305 {
306 if(ajStrGetLen(newstr[frame]) >= minsize &&
307 ajStrGetLen(newstr[frame]) <= maxsize)
308 {
309 /* create a new sequence */
310 if(codon == STOP)
311 getorf_WriteORF(seq, len, seqlen, sense,
312 find, orf_no, start[frame],
313 pos-1, newstr[frame],
314 seqout, around);
315 else
316 getorf_WriteORF(seq, len, seqlen, sense,
317 find, orf_no, start[frame],
318 pos+2, newstr[frame],
319 seqout, around);
320 }
321
322 ajStrSetClear(&newstr[frame]);
323 }
324
325 /*
326 ** if its a circular genome and the STOP codon hits past
327 ** the end of the genome in all frames, then break
328 */
329 if(circular && pos >= len)
330 {
331 ORF[frame] = ajFalse; /* past the end of the genome */
332 LASTORF[frame] = ajTrue; /* finished getting ORFs */
333 if(LASTORF[0] && LASTORF[1] && LASTORF[2])
334 break;
335 }
336 else
337 {
338 /*
339 ** hit a STOP, therefore a potential ORF to write
340 ** out next time, even if the genome is circular
341 */
342 ORF[frame] = ajTrue;
343 start[frame] = pos+3; /* next start of the ORF */
344 }
345
346 }
347 else if(ORF[frame])
348 /* append sequence to newstr if in an ORF */
349 getorf_AppORF(find, &newstr[frame], chrseq, pos, aa);
350 }
351 else /* Look for start: P_START2STOP N_START2STOP AROUND_START */
352 {
353
354 if(codon == START && !ORF[frame])
355 {
356 /* not in a ORF already and found a START */
357 if(pos < len)
358 {
359 /*
360 ** reset the newstr to zero length to enable
361 ** storing the ORF for this
362 */
363 ajStrSetClear(&newstr[frame]);
364 ORF[frame] = ajTrue; /* now in an ORF */
365 start[frame] = pos; /* start of the ORF for this frame */
366 if(methionine)
367 getorf_AppORF(find, &newstr[frame], chrseq,
368 pos, 'M');
369 else
370 getorf_AppORF(find, &newstr[frame], chrseq,
371 pos, aa);
372 }
373 }
374 else if(codon == STOP)
375 {
376 /* hit a STOP */
377
378 /* Already have a sequence to write out? */
379 if(ORF[frame])
380 {
381 ORF[frame] = ajFalse; /* not in an ORF */
382
383 if(ajStrGetLen(newstr[frame]) >= minsize &&
384 ajStrGetLen(newstr[frame]) <= maxsize)
385 {
386 /* create a new sequence */
387 getorf_WriteORF(seq, len, seqlen, sense,
388 find, orf_no, start[frame],
389 pos-1, newstr[frame],
390 seqout, around);
391 }
392 }
393
394 /*
395 ** if a circular genome and hit the STOP past
396 ** the end of the genome in all frames, then break
397 */
398 if(circular && pos >= len)
399 {
400 LASTORF[frame] = ajTrue; /* finished getting ORFs */
401 if(LASTORF[0] && LASTORF[1] && LASTORF[2]) break;
402 }
403
404 ajStrSetClear(&newstr[frame]);
405 }
406 else if(pos >= seqlen-5)
407 {
408 /* hit the end of the sequence without a stop */
409
410 /* Already have a sequence to write out? */
411 if(ORF[frame])
412 {
413 ORF[frame] = ajFalse; /* not in an ORF */
414
415 /*
416 ** End of the sequence? If so, append any
417 ** last codon to the sequence - otherwise, ignore the
418 ** STOP codon
419 */
420 if(pos >= seqlen-5 && pos < seqlen-2)
421 getorf_AppORF(find, &newstr[frame], chrseq,
422 pos, aa);
423
424 if(ajStrGetLen(newstr[frame]) >= minsize &&
425 ajStrGetLen(newstr[frame]) <= maxsize)
426 {
427 /* create a new sequence */
428 getorf_WriteORF(seq, len, seqlen, sense,
429 find, orf_no, start[frame],
430 pos+2, newstr[frame],
431 seqout, around);
432 }
433 }
434
435 /*
436 ** if a circular genome and hit the STOP past
437 ** the end of the genome in all frames, then break
438 */
439 if(circular && pos >= len)
440 {
441 LASTORF[frame] = ajTrue; /* finished getting ORFs */
442 if(LASTORF[0] && LASTORF[1] && LASTORF[2]) break;
443 }
444
445 ajStrSetClear(&newstr[frame]);
446 }
447 else
448 if(ORF[frame])
449 getorf_AppORF(find, &newstr[frame], chrseq, pos,
450 aa);
451
452 }
453 }
454
455 /*
456 ** Currently miss reporting a STOP-to-STOP ORF that is
457 ** the full length of a circular genome when there are no STOP codons in
458 ** that frame
459 */
460 if((find == P_STOP2STOP || find == N_STOP2STOP) && circular)
461 {
462 if(!GOTSTOP[0])
463 {
464 /* translate frame 1 into pep */
465 pep = ajTrnSeqOrig(trnTable, seq, 1);
466 if(ajSeqGetLen(pep) >= minsize &&
467 ajSeqGetLen(pep) <= maxsize)
468 getorf_WriteORF(seq, len, seqlen, sense, find, orf_no,
469 0, seqlen-1, ajSeqGetSeqS(pep), seqout,
470 around);
471 ajSeqDel(&pep);
472 }
473
474 if(!GOTSTOP[1])
475 {
476 /* translate frame 2 into pep */
477 pep = ajTrnSeqOrig(trnTable, seq, 2);
478 if(ajSeqGetLen(pep) >= minsize &&
479 ajSeqGetLen(pep) <= maxsize)
480 getorf_WriteORF(seq, len, seqlen, sense, find, orf_no,
481 1, seqlen-1, ajSeqGetSeqS(pep), seqout,
482 around);
483 ajSeqDel(&pep);
484 }
485
486 if(!GOTSTOP[2])
487 {
488 /* translate frame 3 into pep */
489 pep = ajTrnSeqOrig(trnTable, seq, 3);
490 if(ajSeqGetLen(pep) >= minsize &&
491 ajSeqGetLen(pep) >= maxsize)
492 getorf_WriteORF(seq, len, seqlen, sense, find, orf_no,
493 2, seqlen-1, ajSeqGetSeqS(pep), seqout,
494 around);
495 ajSeqDel(&pep);
496 }
497 }
498
499 for(i=0;i<3;++i)
500 ajStrDel(&newstr[i]);
501
502 return;
503 }
504
505
506
507
508 /* @funcstatic getorf_WriteORF ************************************************
509 **
510 ** Undocumented.
511 **
512 ** @param [r] seq [const AjPSeq] Undocumented
513 ** @param [r] len [ajint] Undocumented
514 ** @param [r] seqlen [ajint] Undocumented
515 ** @param [r] sense [AjBool] Undocumented
516 ** @param [r] find [ajint] Undocumented
517 ** @param [w] orf_no [ajint*] Undocumented
518 ** @param [r] start [ajint] Undocumented
519 ** @param [r] pos [ajint] Undocumented
520 ** @param [r] str [const AjPStr] Undocumented
521 ** @param [u] seqout [AjPSeqout] Undocumented
522 ** @param [r] around [ajint] Undocumented
523 ** @@
524 ******************************************************************************/
525
getorf_WriteORF(const AjPSeq seq,ajint len,ajint seqlen,AjBool sense,ajint find,ajint * orf_no,ajint start,ajint pos,const AjPStr str,AjPSeqout seqout,ajint around)526 static void getorf_WriteORF(const AjPSeq seq,
527 ajint len, ajint seqlen, AjBool sense,
528 ajint find, ajint *orf_no, ajint start, ajint pos,
529 const AjPStr str, AjPSeqout seqout, ajint around)
530 {
531 AjPSeq new;
532 AjPStr name = NULL; /* name of the ORF */
533 AjPStr value = NULL; /* string value of the ORF number */
534 ajint s;
535 ajint e; /* start and end positions */
536 AjPStr aroundstr = NULL; /* holds sequence string around the
537 codon of interest */
538 ajint codonpos = 0; /* holds position of start of codon
539 of interest */
540
541 s = start+1;
542 e = pos+1;
543
544 /*
545 ** it is possible for an ORF in a circular genome to appear to start
546 ** past the end of the genome.
547 ** Move the reported positions back to start in the range 1..len
548 ** for readability.
549 */
550 while(s > len)
551 {
552 s -= len;
553 e -= len;
554 }
555
556 new = ajSeqNew();
557 if(find == N_STOP2STOP || find == N_START2STOP ||
558 find == AROUND_INIT_STOP || find == AROUND_END_STOP ||
559 find == AROUND_START)
560 ajSeqSetNuc(new);
561 else
562 ajSeqSetProt(new);
563
564
565 /*
566 ** Set the start and end positions to report and get the sequence for
567 ** the AROUND* sequences
568 */
569 if(find == AROUND_INIT_STOP)
570 {
571 codonpos = s-3;
572 s = codonpos - around; /* 50 before the initial STOP */
573 e = codonpos + around+2; /* 50 after the end of the STOP */
574 if(s < 1)
575 return;
576
577 if(e > seqlen)
578 return;
579 ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1);
580
581 }
582 else if(find == AROUND_START)
583 {
584 codonpos = s;
585 s = codonpos - around; /* 50 before the initial STOP */
586 e = codonpos + around+2; /* 50 after the end of the STOP */
587 if(s < 1)
588 return;
589
590 if(e > seqlen)
591 return;
592 ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1);
593
594 }
595 else if(find == AROUND_END_STOP)
596 {
597 codonpos = e+1;
598 s = codonpos - around; /* 50 before the initial STOP */
599 e = codonpos + around+2; /* 50 after the end of the STOP */
600 if(s < 1)
601 return;
602
603 if(e > seqlen)
604 return;
605 ajStrAssignSubS(&aroundstr, ajSeqGetSeqS(seq), s-1, e-1);
606 }
607
608
609
610 /* set the name and description */
611 ajStrAssignS(&name, ajSeqGetNameS(seq));
612 ajStrAppendC(&name, "_");
613
614 /* post-increment the ORF number for the next ORF */
615 ajStrFromInt(&value,(*orf_no)++);
616
617 ajStrAppendS(&name, value);
618 ajSeqAssignNameS(new, name);
619
620 /* set the description of the translation */
621 ajStrAssignC(&name, "[");
622
623 /* Reverse the reported positions if this is the reverse sense */
624 if(!sense)
625 {
626 s = len-s+1;
627 e = len-e+1;
628
629 /*
630 ** shift the positions back into the range 1..len as far as possible
631 ** without going into negative numbers
632 */
633 while(e > len)
634 {
635 s -= len;
636 e -= len;
637 }
638
639 while(e < 0 || s < 0)
640 {
641 s += len;
642 e += len;
643 }
644 }
645
646 /* the base before the stop codon (numbering bases from 1) */
647 ajStrFromInt(&value, s + ajSeqGetOffset(seq));
648
649 ajStrAppendS(&name, value);
650 ajStrAppendC(&name, " - ");
651
652 /* the base before the stop codon (numbering bases from 1) */
653 ajStrFromInt(&value, e + ajSeqGetOffset(seq));
654
655
656 ajStrAppendS(&name, value);
657 ajStrAppendC(&name, "] ");
658
659 /* make it clear if this is the reverse sense */
660 if(!sense)
661 ajStrAppendC(&name, "(REVERSE SENSE) ");
662
663
664 /*
665 ** make it clear if this is a circular genome and the ORF crosses
666 ** the breakpoint
667 */
668 if(s> len || e > len)
669 ajStrAppendC(&name, "(ORF crosses the breakpoint) ");
670
671
672 if(find == AROUND_INIT_STOP || find == AROUND_START ||
673 find == AROUND_END_STOP)
674 {
675 ajStrAppendC(&name, "Around codon at ");
676 ajStrFromInt(&value, codonpos);
677 ajStrAppendS(&name, value);
678 ajStrAppendC(&name, ". ");
679 }
680
681 ajStrAppendS(&name, ajSeqGetDescS(seq));
682 ajSeqAssignDescS(new, name);
683
684
685 /* replace newstr in new */
686 if(find == N_STOP2STOP || find == N_START2STOP || find == P_STOP2STOP ||
687 find == P_START2STOP)
688 ajSeqAssignSeqS(new, str);
689 else
690 /* sequence to be 50 bases around the codon */
691 ajSeqAssignSeqS(new, aroundstr);
692
693 ajSeqoutWriteSeq(seqout, new);
694
695 ajSeqDel(&new);
696 ajStrDel(&value);
697 ajStrDel(&name);
698
699 return;
700 }
701
702
703
704
705 /* @funcstatic getorf_AppORF **************************************************
706 **
707 ** append aa to ORF sequence string
708 **
709 ** @param [r] find [ajint] Find code
710 ** @param [u] str [AjPStr*] Sequence string
711 ** @param [r] chrseq [const char*] Undocumented
712 ** @param [r] pos [ajint] Codon triplet position in chrseq
713 ** @param [r] aa [char] Undocumented
714 ** @@
715 ******************************************************************************/
716
getorf_AppORF(ajint find,AjPStr * str,const char * chrseq,ajint pos,char aa)717 static void getorf_AppORF(ajint find, AjPStr *str,
718 const char *chrseq, ajint pos,
719 char aa)
720 {
721
722 if(find == N_STOP2STOP || find == N_START2STOP ||
723 find == AROUND_INIT_STOP || find == AROUND_END_STOP)
724 {
725 ajStrAppendK(str, chrseq[pos]);
726 ajStrAppendK(str, chrseq[pos+1]);
727 ajStrAppendK(str, chrseq[pos+2]);
728 }
729 else if(find == P_STOP2STOP || find == P_START2STOP ||
730 find == AROUND_START)
731 ajStrAppendK(str, aa);
732
733 return;
734 }
735