1
2 /* eufindtRNA - Eukaryotic tRNA finder
3 *
4 * C implementation of algorithm described by Pavesi, Conterio,
5 * Bolchi, Dieci, & Ottonello in NAR 22:1247-56 (94)
6 * "Identification of new eukaryotic tRNA genes in genomic DNA
7 * databases by a multistep weight matix analysis of transcriptional
8 * control regions"
9 *
10 * To be used in tRNAscan-SE package to increase sensitivity by
11 * complementing tRNAscan 1.3 first-pass scan
12 *
13 * by Todd MJ Lowe 4/8/96
14 *
15 * Uses Sean Eddy's function library for biological sequence analysis
16 * (Squid v1.5g)
17 *
18 * v1.1: Small bug fixed 8/2000 that caused second of two consecutive tRNAs
19 * (within 40bp) to be missed if the second tRNA scored lower than the first
20 */
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include "squid.h"
26 #include "sqfuncs.h"
27 #include "eufind_const.h"
28
29 char eufind_version[] = "1.1";
30 char eufind_date[] = "Aug 2000";
31
32
33 #define OPTIONS "ho:l:X:I:rsDFi:"
34 char usage[] = "\n\
35 Usage: eufindtRNA [-options] <sequence file>\n\
36 Find tRNAs in eukaryotic sequences\n\n\
37 Available options:\n\
38 -h : help - print version and usage info\n\
39 -o <outfile> : save tRNAs in <outfile>\n\
40 -r : relaxed mode (no terminators searched for)\n\
41 -s : strict mode (discard tRNAs with missing terminators)\n\
42 -l <length> : set max intron+variable loop length (default=140)\n\
43 -X <Score> : manually set final score cutoff to <Score> (def=-31.8)\n\
44 -I <Score> : set cutoff for intermediate score (def=-31.25)\n\
45 -D : save tRNA score components (for Debugging)\n\
46 -F : don't check for uppercase or DNA alphabet\n\
47 -i <integer> : start nucleotide numbering at <integer> (def=1)\n\n";
48
49 int
main(int argc,char ** argv)50 main (int argc, char **argv)
51 {
52 char *seqfile; /* file containing aligned seqs */
53 char *outfile; /* destination file for tRNAs */
54 int fmt; /* format of seqfile */
55 FILE *outfp; /* open outfile */
56 SQFILE *seqfp;
57 SQINFO sqinfo;
58 int i, errno,
59 ShowScores, /* flag for type of info output when
60 saving tRNAs */
61 RelaxedMode, /* flag for relaxed scanning mode, do
62 not look for poly T terminator
63 signal */
64 StrictMode, /* require poly T terminator */
65 NoReformat; /* flag to prevent extra work of
66 /* changing seqs to DNA & upper case */
67 /* alphabet */
68
69 float NoTermPenalty; /* penalty val for tRNAs with no */
70 /* poly T terminator */
71
72 long int sqoffset; /* nucleotide numbering offset (set with -i param) */
73 char *seq, *revseq, /* sequence */
74 *iseq, *reviseq; /* encoded seq & encoded reverse comp */
75 int strand, /* 1 for orig seq, -1 for rev comp */
76 seqidx; /* current position in seq */
77 float FirstScore, /* initial (Bbox) logodds score */
78 IntScore, TotScore, /* cum tRNA logodds scores */
79 IntScoreCutoff,
80 TotScoreCutoff; /* cutoff for reporting tRNAs */
81 TRNA_TYPE *tRNA, *prev_tRNA,
82 *swap_tRNA; /* current & previous tRNA info */
83 int Max_AB_dist; /* max nuc. distance searched upstream */
84 /* of candidate B boxes for A boxes */
85 int tRNA_ct;
86
87 int optchar;
88 extern char *optarg;
89 extern int optind;
90
91 /***********************************************
92 * Parse command line
93 ***********************************************/
94
95 outfile = NULL;
96 TotScoreCutoff = TOT_SCORE_THRESH;
97 IntScoreCutoff = INT_SCORE_THRESH;
98 Max_AB_dist = MIN_AB_BOX_DIST + AB_BOX_DIST_RANGE;
99 sqoffset = 0;
100 ShowScores = 0;
101 NoTermPenalty = MAX_PENALTY;
102 RelaxedMode = 0;
103 StrictMode = 0;
104 NoReformat = 0;
105
106 while ((optchar = getopt(argc, argv, OPTIONS)) != -1)
107 switch (optchar) {
108
109 case 'o': outfile = optarg; break;
110 case 'h':
111 printf("eufindtRNA %s, %s\n%s\n", eufind_version, eufind_date, usage);
112 exit(EXIT_SUCCESS);
113 case 'r': RelaxedMode = 1; break;
114 case 's': NoTermPenalty = 10*MAX_PENALTY; StrictMode = 1; break;
115 case 'X': TotScoreCutoff = atof(optarg); break;
116 case 'D': ShowScores = 1; break;
117 case 'F': NoReformat = 1; break;
118 case 'l': Max_AB_dist = MIN_AB_BOX_DIST + atof(optarg); break;
119 case 'I': IntScoreCutoff = atof(optarg); break;
120 case 'i': sqoffset = atof(optarg)-1; break;
121 default:
122 Die("%s\n", usage);
123 }
124
125 if (argc -optind != 1)
126 Die("Wrong number of arguments specified on command line\n%s\n", usage);
127
128 seqfile = argv[optind];
129
130 if (outfile == NULL)
131 outfp = stdout;
132 else if ((outfp = fopen(outfile, "w")) == NULL)
133 Die("Failed to open tRNA output file %s", outfile);
134
135
136 if ((tRNA = (TRNA_TYPE *) malloc (sizeof(TRNA_TYPE))) == NULL)
137 Die("Memory failure, couldn't allocate tRNA memory\n");
138 if ((prev_tRNA = (TRNA_TYPE *) malloc (sizeof(TRNA_TYPE))) == NULL)
139 Die("Memory failure, couldn't allocate tRNA memory\n");
140
141
142 /***********************************************
143 * Determine seq format & open for reading *
144 ***********************************************/
145
146 if (! SeqfileFormat(seqfile, &fmt, NULL))
147 Die("Can't determine format of file %s\n", seqfile);
148 if ((seqfp = SeqfileOpen(seqfile, fmt, NULL)) == NULL)
149 Die("Failed to open sequence file %s for reading", seqfile);
150
151 while (ReadSeq(seqfp, fmt, &seq, &sqinfo)) {
152
153 if (ShowScores)
154 printf ("Seq: %s\n",sqinfo.name);
155
156 tRNA_ct = 0;
157
158 if (!NoReformat) {
159 ToDNA(seq);
160 s2upper(seq);
161 }
162
163 /* allocate mem for integer-encoded seq (A=0,C=1,G=2,T=3) */
164 if ((iseq = calloc (sqinfo.len+2, sizeof(char))) == NULL)
165 Die("Memory failure, couldn't allocate sequence\n");
166
167 /* integer-encode sequence */
168 if (errno = IntEncodeSeq(iseq,seq,sqinfo.len))
169 Die("Unable to encode sequence %s at base %d\n",
170 sqinfo.name,errno);
171
172 /* Search both strands (0=top strand, -1=bottom strand) */
173 for (strand=0; strand >= -1; strand--) {
174
175 Init_tRNA(prev_tRNA); /* clear previous tRNA */
176
177 /* take reverse complement of encoded seq if searching bottom strand */
178 if (strand == -1) {
179 if ((revseq = calloc (sqinfo.len+2, sizeof(char))) == NULL)
180 Die("Memory failure, couldn't allocate reverse sequence\n");
181 revcomp(revseq, seq);
182 free(seq);
183 seq = revseq;
184 if (IntEncodeSeq(iseq,seq,sqinfo.len))
185 Die("Unable to encode sequence\n");
186 }
187
188
189 /***********************************************
190 * Find transcriptional promotor elements *
191 ***********************************************/
192
193 seqidx=BBOX_START_IDX-2;
194 while (GetBbox(&FirstScore,&seqidx,iseq,sqinfo.len,strand,
195 ShowScores)) {
196
197 Init_tRNA(tRNA);
198 tRNA->Bbox_st = seqidx;
199 tRNA->Bbox_end = seqidx + BBOX_LEN-1;
200 tRNA->BboxSc = FirstScore;
201
202 if (((FirstScore >= SEC_LOBOUND) &&
203 (FirstScore <= SEC_HIBOUND))
204 && (GetSecABox(tRNA,seq))){
205
206 if (!GetBestTrxTerm(tRNA,seq,sqinfo.len,NoTermPenalty) &&
207 StrictMode)
208 continue; /* look for next B box */
209
210 strcpy(tRNA->acodon,"TCA");
211 /* tRNA->Bbox_end++; */
212 tRNA->totSc = FirstScore;
213 }
214
215 else { /* Searching for non-SelCys tRNA */
216
217
218 GetBestABox(tRNA,seq,iseq,sqinfo.len,strand,ShowScores,
219 Max_AB_dist,prev_tRNA->Abox_st);
220 IntScore = tRNA->AboxSc + tRNA->BboxSc + tRNA->ABdistSc;
221 if (IntScore < IntScoreCutoff)
222 continue; /* look for next B box */
223
224 if (!RelaxedMode) {
225 if (!GetBestTrxTerm(tRNA,seq,sqinfo.len,NoTermPenalty) &&
226 StrictMode)
227 continue;
228 TotScore = IntScore + tRNA->TermSc;
229 if (TotScore < TotScoreCutoff)
230 continue; /* look for next B box */
231
232 tRNA->totSc = TotScore;
233 }
234 else
235 tRNA->totSc = IntScore;
236
237 }
238
239 Get_tRNA_stats(tRNA,seq,sqinfo.len,strand);
240
241 if (tRNAOverlap(tRNA,prev_tRNA,strand)) {
242
243 if (tRNA->totSc < prev_tRNA->totSc) {
244 Init_tRNA(tRNA); /* skip repeat tRNA */
245 }
246 else { /* swap, but don't save yet */
247
248 swap_tRNA = prev_tRNA;
249 prev_tRNA = tRNA;
250 Init_tRNA(swap_tRNA);
251 tRNA = swap_tRNA;
252 }
253 }
254 else { /* no overlap, save & then swap */
255
256 if (prev_tRNA->start > 0) {
257 prev_tRNA->idno = ++tRNA_ct;
258 Save_tRNA(prev_tRNA,&sqinfo,seq,strand,ShowScores,sqoffset);
259 }
260 swap_tRNA = prev_tRNA;
261 prev_tRNA = tRNA;
262 Init_tRNA(swap_tRNA);
263 tRNA = swap_tRNA;
264 }
265
266 } /* find B box */
267
268 /* save last buffered tRNA before going to other strand */
269
270 if (prev_tRNA->start > 0) {
271 prev_tRNA->idno = ++tRNA_ct;
272 Save_tRNA(prev_tRNA,&sqinfo,seq,strand,ShowScores,sqoffset);
273 }
274
275 } /* search both strands */
276
277 FreeSequence(seq, &sqinfo);
278 free(iseq);
279 }
280
281 SeqfileClose(seqfp);
282 fclose(outfp);
283 return 0;
284 }
285
286
287
288
289
290
291
292
293
294
295