1 
2 /* eufindtRNA  - Eukaryotic tRNA finder
3  *
4  * C implementation of algorithm described by Pavesi, Conterio,
5  * Bolchi, Dieci, & Ottonello in NAR 22:1247-56 (94)
6  * "Identification of new eukaryotic tRNA genes in genomic DNA
7  * databases by a multistep weight matix analysis of transcriptional
8  * control regions"
9  *
10  * To be used in tRNAscan-SE package to increase sensitivity by
11  * complementing tRNAscan 1.3 first-pass scan
12  *
13  * by Todd MJ Lowe    4/8/96
14  *
15  * Uses Sean Eddy's function library for biological sequence analysis
16  * (Squid v1.5g)
17  *
18  * v1.1: Small bug fixed 8/2000 that caused second of two consecutive tRNAs
19  * (within 40bp) to be missed if the second tRNA scored lower than the first
20  */
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include "squid.h"
26 #include "sqfuncs.h"
27 #include "eufind_const.h"
28 
29 char eufind_version[] = "1.1";
30 char eufind_date[]    = "Aug 2000";
31 
32 
33 #define OPTIONS "ho:l:X:I:rsDFi:"
34 char usage[] = "\n\
35 Usage: eufindtRNA [-options] <sequence file>\n\
36 Find tRNAs in eukaryotic sequences\n\n\
37 Available options:\n\
38 -h             : help - print version and usage info\n\
39 -o <outfile>   : save tRNAs in <outfile>\n\
40 -r             : relaxed mode (no terminators searched for)\n\
41 -s             : strict mode (discard tRNAs with missing terminators)\n\
42 -l <length>    : set max intron+variable loop length (default=140)\n\
43 -X <Score>     : manually set final score cutoff to <Score> (def=-31.8)\n\
44 -I <Score>     : set cutoff for intermediate score (def=-31.25)\n\
45 -D             : save tRNA score components (for Debugging)\n\
46 -F             : don't check for uppercase or DNA alphabet\n\
47 -i <integer>   : start nucleotide numbering at <integer> (def=1)\n\n";
48 
49 int
main(int argc,char ** argv)50 main (int argc, char **argv)
51 {
52   char  *seqfile;               /* file containing aligned seqs */
53   char  *outfile;               /* destination file for tRNAs   */
54   int       fmt;		/* format of seqfile  */
55   FILE     *outfp;                /* open outfile                 */
56   SQFILE   *seqfp;
57   SQINFO    sqinfo;
58   int       i, errno,
59     ShowScores,                 /* flag for type of info output when
60 				   saving tRNAs */
61     RelaxedMode,                /* flag for relaxed scanning mode, do
62 				   not look for poly T terminator
63 				   signal */
64     StrictMode,                 /* require poly T terminator */
65     NoReformat;                 /* flag to prevent extra work of
66 				/*   changing seqs to DNA & upper case */
67 				/*   alphabet */
68 
69   float  NoTermPenalty;         /* penalty val for tRNAs with no */
70                                 /* poly T terminator */
71 
72   long int sqoffset;           /* nucleotide numbering offset (set with -i param) */
73   char *seq, *revseq,                  /* sequence */
74     *iseq, *reviseq;           /* encoded seq & encoded reverse comp */
75   int strand,                  /* 1 for orig seq, -1 for rev comp */
76     seqidx;                    /* current position in seq */
77   float FirstScore,            /* initial (Bbox) logodds score  */
78     IntScore, TotScore,        /* cum tRNA logodds scores */
79     IntScoreCutoff,
80     TotScoreCutoff;            /* cutoff for reporting tRNAs */
81   TRNA_TYPE *tRNA, *prev_tRNA,
82     *swap_tRNA;                 /* current & previous tRNA info */
83   int Max_AB_dist;              /* max nuc. distance searched upstream */
84 				/* of candidate B boxes for A boxes */
85   int tRNA_ct;
86 
87   int          optchar;
88   extern char *optarg;
89   extern int   optind;
90 
91   /***********************************************
92    * Parse command line
93    ***********************************************/
94 
95   outfile    = NULL;
96   TotScoreCutoff = TOT_SCORE_THRESH;
97   IntScoreCutoff = INT_SCORE_THRESH;
98   Max_AB_dist = MIN_AB_BOX_DIST + AB_BOX_DIST_RANGE;
99   sqoffset = 0;
100   ShowScores = 0;
101   NoTermPenalty = MAX_PENALTY;
102   RelaxedMode = 0;
103   StrictMode = 0;
104   NoReformat = 0;
105 
106   while ((optchar = getopt(argc, argv, OPTIONS)) != -1)
107     switch (optchar) {
108 
109     case 'o': outfile    = optarg; break;
110     case 'h':
111       printf("eufindtRNA %s, %s\n%s\n", eufind_version, eufind_date, usage);
112       exit(EXIT_SUCCESS);
113     case 'r': RelaxedMode = 1; break;
114     case 's': NoTermPenalty = 10*MAX_PENALTY; StrictMode = 1; break;
115     case 'X': TotScoreCutoff = atof(optarg); break;
116     case 'D': ShowScores = 1; break;
117     case 'F': NoReformat = 1; break;
118     case 'l': Max_AB_dist = MIN_AB_BOX_DIST + atof(optarg); break;
119     case 'I': IntScoreCutoff = atof(optarg); break;
120     case 'i': sqoffset = atof(optarg)-1; break;
121     default:
122       Die("%s\n", usage);
123     }
124 
125   if (argc -optind != 1)
126     Die("Wrong number of arguments specified on command line\n%s\n", usage);
127 
128   seqfile = argv[optind];
129 
130   if (outfile == NULL)
131     outfp = stdout;
132   else if ((outfp = fopen(outfile, "w")) == NULL)
133     Die("Failed to open tRNA output file %s", outfile);
134 
135 
136   if ((tRNA = (TRNA_TYPE *) malloc (sizeof(TRNA_TYPE))) == NULL)
137     Die("Memory failure, couldn't allocate tRNA memory\n");
138   if ((prev_tRNA = (TRNA_TYPE *) malloc (sizeof(TRNA_TYPE))) == NULL)
139     Die("Memory failure, couldn't allocate tRNA memory\n");
140 
141 
142   /***********************************************
143    * Determine seq format & open for reading     *
144    ***********************************************/
145 
146   if (! SeqfileFormat(seqfile, &fmt, NULL))
147     Die("Can't determine format of file %s\n", seqfile);
148   if ((seqfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
149     Die("Failed to open sequence file %s for reading", seqfile);
150 
151   while (ReadSeq(seqfp, fmt, &seq, &sqinfo)) {
152 
153     if (ShowScores)
154       printf ("Seq: %s\n",sqinfo.name);
155 
156     tRNA_ct = 0;
157 
158     if (!NoReformat) {
159       ToDNA(seq);
160       s2upper(seq);
161     }
162 
163     /* allocate mem for integer-encoded seq (A=0,C=1,G=2,T=3) */
164     if ((iseq = calloc (sqinfo.len+2, sizeof(char))) == NULL)
165       Die("Memory failure, couldn't allocate sequence\n");
166 
167     /* integer-encode sequence */
168     if (errno = IntEncodeSeq(iseq,seq,sqinfo.len))
169       Die("Unable to encode sequence %s at base %d\n",
170 	  sqinfo.name,errno);
171 
172     /* Search both strands (0=top strand, -1=bottom strand)  */
173     for (strand=0; strand >= -1; strand--) {
174 
175       Init_tRNA(prev_tRNA);     /* clear previous tRNA */
176 
177       /* take reverse complement of encoded seq if searching bottom strand */
178       if (strand == -1) {
179 	if ((revseq = calloc (sqinfo.len+2, sizeof(char))) == NULL)
180 	  Die("Memory failure, couldn't allocate reverse sequence\n");
181 	revcomp(revseq, seq);
182 	free(seq);
183 	seq = revseq;
184 	if (IntEncodeSeq(iseq,seq,sqinfo.len))
185 	  Die("Unable to encode sequence\n");
186       }
187 
188 
189       /***********************************************
190        * Find transcriptional promotor elements      *
191        ***********************************************/
192 
193       seqidx=BBOX_START_IDX-2;
194       while (GetBbox(&FirstScore,&seqidx,iseq,sqinfo.len,strand,
195 		     ShowScores)) {
196 
197 	Init_tRNA(tRNA);
198 	tRNA->Bbox_st = seqidx;
199 	tRNA->Bbox_end = seqidx + BBOX_LEN-1;
200 	tRNA->BboxSc = FirstScore;
201 
202 	if (((FirstScore >= SEC_LOBOUND) &&
203 	     (FirstScore <= SEC_HIBOUND))
204 	    && (GetSecABox(tRNA,seq))){
205 
206 	  if (!GetBestTrxTerm(tRNA,seq,sqinfo.len,NoTermPenalty) &&
207 	      StrictMode)
208 	    continue;       /* look for next B box */
209 
210 	  strcpy(tRNA->acodon,"TCA");
211 	  /*  tRNA->Bbox_end++; */
212 	  tRNA->totSc = FirstScore;
213 	}
214 
215 	else {           /* Searching for non-SelCys tRNA */
216 
217 
218 	  GetBestABox(tRNA,seq,iseq,sqinfo.len,strand,ShowScores,
219 		      Max_AB_dist,prev_tRNA->Abox_st);
220 	  IntScore = tRNA->AboxSc + tRNA->BboxSc + tRNA->ABdistSc;
221 	  if (IntScore < IntScoreCutoff)
222 	    continue;               /* look for next B box */
223 
224 	  if (!RelaxedMode) {
225 	    if (!GetBestTrxTerm(tRNA,seq,sqinfo.len,NoTermPenalty) &&
226 		StrictMode)
227 	      continue;
228 	    TotScore = IntScore + tRNA->TermSc;
229 	    if (TotScore < TotScoreCutoff)
230 	      continue;               /* look for next B box */
231 
232 	    tRNA->totSc = TotScore;
233 	  }
234 	  else
235 	    tRNA->totSc = IntScore;
236 
237 	}
238 
239 	Get_tRNA_stats(tRNA,seq,sqinfo.len,strand);
240 
241 	if (tRNAOverlap(tRNA,prev_tRNA,strand)) {
242 
243 	  if  (tRNA->totSc < prev_tRNA->totSc) {
244 	    Init_tRNA(tRNA);  /* skip repeat tRNA */
245 	  }
246 	  else {           /* swap, but don't save yet */
247 
248 	    swap_tRNA = prev_tRNA;
249 	    prev_tRNA = tRNA;
250 	    Init_tRNA(swap_tRNA);
251 	    tRNA = swap_tRNA;
252 	  }
253 	}
254 	else {             /* no overlap, save & then swap */
255 
256 	  if (prev_tRNA->start > 0) {
257 	    prev_tRNA->idno = ++tRNA_ct;
258 	    Save_tRNA(prev_tRNA,&sqinfo,seq,strand,ShowScores,sqoffset);
259 	  }
260 	  swap_tRNA = prev_tRNA;
261 	  prev_tRNA = tRNA;
262 	  Init_tRNA(swap_tRNA);
263 	  tRNA = swap_tRNA;
264 	}
265 
266       }  /* find B box */
267 
268       /* save last buffered tRNA before going to other strand */
269 
270       if (prev_tRNA->start > 0) {
271 	prev_tRNA->idno = ++tRNA_ct;
272 	Save_tRNA(prev_tRNA,&sqinfo,seq,strand,ShowScores,sqoffset);
273       }
274 
275     }   /* search both strands  */
276 
277     FreeSequence(seq, &sqinfo);
278     free(iseq);
279   }
280 
281   SeqfileClose(seqfp);
282   fclose(outfp);
283   return 0;
284 }
285 
286 
287 
288 
289 
290 
291 
292 
293 
294 
295