1 /*******************************************************************************
2 PRODIGAL (PROkaryotic DynamIc Programming Genefinding ALgorithm)
3 Copyright (C) 2007-2016 University of Tennessee / UT-Battelle
4
5 Code Author: Doug Hyatt
6
7 This program is free software: you can redistribute it and/or modify
8 it under the terms of the GNU General Public License as published by
9 the Free Software Foundation, either version 3 of the License, or
10 (at your option) any later version.
11
12 This program is distributed in the hope that it will be useful,
13 but WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 GNU General Public License for more details.
16
17 You should have received a copy of the GNU General Public License
18 along with this program. If not, see <http://www.gnu.org/licenses/>.
19 *******************************************************************************/
20
21 #include "dprog.h"
22
23 /*******************************************************************************
24 Basic dynamic programming routine for predicting genes. The 'flag' variable
25 is set to 0 for the initial dynamic programming routine based solely on GC
26 frame plot (used to construct a training set. If the flag is set to 1, the
27 routine does the final dynamic programming based on
28 coding, RBS scores, etc.
29 *******************************************************************************/
30
dprog(struct _node * nod,int nn,struct _training * tinf,int flag)31 int dprog(struct _node *nod, int nn, struct _training *tinf, int flag) {
32 int i, j, min, max_ndx = -1, path, nxt, tmp;
33 double max_sc = -1.0;
34
35 if(nn == 0) return -1;
36 for(i = 0; i < nn; i++) {
37 nod[i].score = 0;
38 nod[i].traceb = -1;
39 nod[i].tracef = -1;
40 }
41 for(i = 0; i < nn; i++) {
42
43 /* Set up distance constraints for making connections, */
44 /* but make exceptions for giant ORFS. */
45 if(i < MAX_NODE_DIST) min = 0; else min = i-MAX_NODE_DIST;
46 if(nod[i].strand == -1 && nod[i].type != STOP && nod[min].ndx >=
47 nod[i].stop_val)
48 while(min >= 0 && nod[i].ndx != nod[i].stop_val) min--;
49 if(nod[i].strand == 1 && nod[i].type == STOP && nod[min].ndx >=
50 nod[i].stop_val)
51 while(min >= 0 && nod[i].ndx != nod[i].stop_val) min--;
52 if(min < MAX_NODE_DIST) min = 0;
53 else min = min-MAX_NODE_DIST;
54 for(j = min; j < i; j++) {
55 score_connection(nod, j, i, tinf, flag);
56 }
57 }
58 for(i = nn-1; i >= 0; i--) {
59 if(nod[i].strand == 1 && nod[i].type != STOP) continue;
60 if(nod[i].strand == -1 && nod[i].type == STOP) continue;
61 if(nod[i].score > max_sc) { max_sc = nod[i].score; max_ndx = i; }
62 }
63
64 /* First Pass: untangle the triple overlaps */
65 path = max_ndx;
66 while(nod[path].traceb != -1) {
67 nxt = nod[path].traceb;
68 if(nod[path].strand == -1 && nod[path].type == STOP &&
69 nod[nxt].strand == 1 && nod[nxt].type == STOP &&
70 nod[path].ov_mark != -1 && nod[path].ndx > nod[nxt].ndx) {
71 tmp = nod[path].star_ptr[nod[path].ov_mark];
72 for(i = tmp; nod[i].ndx != nod[tmp].stop_val; i--);
73 nod[path].traceb = tmp;
74 nod[tmp].traceb = i;
75 nod[i].ov_mark = -1;
76 nod[i].traceb = nxt;
77 }
78 path = nod[path].traceb;
79 }
80
81 /* Second Pass: Untangle the simple overlaps */
82 path = max_ndx;
83 while(nod[path].traceb != -1) {
84 nxt = nod[path].traceb;
85 if(nod[path].strand == -1 && nod[path].type != STOP && nod[nxt].strand == 1
86 && nod[nxt].type == STOP) {
87 for(i = path; nod[i].ndx != nod[path].stop_val; i--);
88 nod[path].traceb = i; nod[i].traceb = nxt;
89 }
90 if(nod[path].strand == 1 && nod[path].type == STOP && nod[nxt].strand == 1
91 && nod[nxt].type == STOP) {
92 nod[path].traceb = nod[nxt].star_ptr[(nod[path].ndx)%3];
93 nod[nod[path].traceb].traceb = nxt;
94 }
95 if(nod[path].strand == -1 && nod[path].type == STOP && nod[nxt].strand == -1
96 && nod[nxt].type == STOP) {
97 nod[path].traceb = nod[path].star_ptr[(nod[nxt].ndx)%3];
98 nod[nod[path].traceb].traceb = nxt;
99 }
100 path = nod[path].traceb;
101 }
102
103 /* Mark forward pointers */
104 path = max_ndx;
105 while(nod[path].traceb != -1) {
106 nod[nod[path].traceb].tracef = path;
107 path = nod[path].traceb;
108 }
109
110 if(nod[max_ndx].traceb == -1) return -1;
111 else return max_ndx;
112 }
113
114 /*******************************************************************************
115 This routine scores the connection between two nodes, the most basic of which
116 is 5'fwd->3'fwd (gene) and 3'rev->5'rev (rev gene). If the connection ending
117 at n2 is the maximal scoring model, it updates the pointers in the dynamic
118 programming model. n3 is used to handle overlaps, i.e. cases where 5->3'
119 overlaps 5'->3' on the same strand. In this case, 3' connects directly to 3',
120 and n3 is used to untangle the 5' end of the second gene.
121 *******************************************************************************/
122
score_connection(struct _node * nod,int p1,int p2,struct _training * tinf,int flag)123 void score_connection(struct _node *nod, int p1, int p2, struct _training *tinf,
124 int flag) {
125 struct _node *n1 = &(nod[p1]), *n2 = &(nod[p2]), *n3;
126 int i, left = n1->ndx, right = n2->ndx, bnd, ovlp = 0, maxfr = -1;
127 double score = 0.0, scr_mod = 0.0, maxval;
128
129 /***********************/
130 /* Invalid Connections */
131 /***********************/
132
133 /* 5'fwd->5'fwd, 5'rev->5'rev */
134 if(n1->type != STOP && n2->type != STOP && n1->strand == n2->strand) return;
135
136 /* 5'fwd->5'rev, 5'fwd->3'rev */
137 else if(n1->strand == 1 && n1->type != STOP && n2->strand == -1) return;
138
139 /* 3'rev->5'fwd, 3'rev->3'fwd) */
140 else if(n1->strand == -1 && n1->type == STOP && n2->strand == 1) return;
141
142 /* 5'rev->3'fwd */
143 else if(n1->strand == -1 && n1->type != STOP && n2->strand == 1 && n2->type ==
144 STOP) return;
145
146 /******************/
147 /* Edge Artifacts */
148 /******************/
149 if(n1->traceb == -1 && n1->strand == 1 && n1->type == STOP) return;
150 if(n1->traceb == -1 && n1->strand == -1 && n1->type != STOP) return;
151
152 /*********/
153 /* Genes */
154 /*********/
155
156 /* 5'fwd->3'fwd */
157 else if(n1->strand == n2->strand && n1->strand == 1 && n1->type != STOP &&
158 n2->type == STOP) {
159 if(n2->stop_val >= n1->ndx) return;
160 if(n1->ndx % 3 != n2->ndx % 3) return;
161 right += 2;
162 if(flag == 0) scr_mod = tinf->bias[0]*n1->gc_score[0] +
163 tinf->bias[1]*n1->gc_score[1] + tinf->bias[2]*n1->gc_score[2];
164 else if(flag == 1) score = n1->cscore + n1->sscore;
165 }
166
167 /* 3'rev->5'rev */
168 else if(n1->strand == n2->strand && n1->strand == -1 && n1->type == STOP &&
169 n2->type != STOP) {
170 if(n1->stop_val <= n2->ndx) return;
171 if(n1->ndx % 3 != n2->ndx % 3) return;
172 left -= 2;
173 if(flag == 0) scr_mod = tinf->bias[0]*n2->gc_score[0] +
174 tinf->bias[1]*n2->gc_score[1] + tinf->bias[2]*n2->gc_score[2];
175 else if(flag == 1) score = n2->cscore + n2->sscore;
176 }
177
178 /********************************/
179 /* Intergenic Space (Noncoding) */
180 /********************************/
181
182 /* 3'fwd->5'fwd */
183 else if(n1->strand == 1 && n1->type == STOP && n2->strand == 1 && n2->type !=
184 STOP) {
185 left += 2;
186 if(left >= right) return;
187 if(flag == 1) score = intergenic_mod(n1, n2, tinf);
188 }
189
190 /* 3'fwd->3'rev */
191 else if(n1->strand == 1 && n1->type == STOP && n2->strand == -1 && n2->type ==
192 STOP) {
193 left += 2; right -= 2;
194 if(left >= right) return;
195 /* Overlapping Gene Case 2: Three consecutive overlapping genes f r r */
196 maxfr = -1; maxval = 0.0;
197 for(i = 0; i < 3; i++) {
198 if(n2->star_ptr[i] == -1) continue;
199 n3 = &(nod[n2->star_ptr[i]]);
200 ovlp = left - n3->stop_val + 3;
201 if(ovlp <= 0 || ovlp >= MAX_OPP_OVLP) continue;
202 if(ovlp >= n3->ndx - left) continue;
203 if(n1->traceb == -1) continue;
204 if(ovlp >= n3->stop_val - nod[n1->traceb].ndx - 2) continue;
205 if((flag == 1 && n3->cscore + n3->sscore + intergenic_mod(n3, n2, tinf) >
206 maxval) || (flag == 0 && tinf->bias[0]*n3->gc_score[0] +
207 tinf->bias[1]*n3->gc_score[1] + tinf->bias[2]*n3->gc_score[2] >
208 maxval)) {
209 maxfr = i;
210 maxval = n3->cscore + n3->sscore + intergenic_mod(n3, n2, tinf);
211 }
212 }
213 if(maxfr != -1) {
214 n3 = &(nod[n2->star_ptr[maxfr]]);
215 if(flag == 0) scr_mod = tinf->bias[0]*n3->gc_score[0] +
216 tinf->bias[1]*n3->gc_score[1] +
217 tinf->bias[2]*n3->gc_score[2];
218 else if(flag == 1) score = n3->cscore + n3->sscore +
219 intergenic_mod(n3, n2, tinf);
220 }
221 else if(flag == 1) score = intergenic_mod(n1, n2, tinf);
222 }
223
224 /* 5'rev->3'rev */
225 else if(n1->strand == -1 && n1->type != STOP && n2->strand == -1 && n2->type
226 == STOP) {
227 right -= 2;
228 if(left >= right) return;
229 if(flag == 1) score = intergenic_mod(n1, n2, tinf);
230 }
231
232 /* 5'rev->5'fwd */
233 else if(n1->strand == -1 && n1->type != STOP && n2->strand == 1 && n2->type
234 != STOP) {
235 if(left >= right) return;
236 if(flag == 1) score = intergenic_mod(n1, n2, tinf);
237 }
238
239 /********************/
240 /* Possible Operons */
241 /********************/
242
243 /* 3'fwd->3'fwd, check for a start just to left of first 3' */
244 else if(n1->strand == 1 && n2->strand == 1 && n1->type == STOP && n2->type ==
245 STOP) {
246 if(n2->stop_val >= n1->ndx) return;
247 if(n1->star_ptr[n2->ndx%3] == -1) return;
248 n3 = &(nod[n1->star_ptr[n2->ndx%3]]);
249 left = n3->ndx; right += 2;
250 if(flag == 0) scr_mod = tinf->bias[0]*n3->gc_score[0] +
251 tinf->bias[1]*n3->gc_score[1] + tinf->bias[2]*n3->gc_score[2];
252 else if(flag == 1) score = n3->cscore + n3->sscore +
253 intergenic_mod(n1, n3, tinf);
254 }
255
256 /* 3'rev->3'rev, check for a start just to right of second 3' */
257 else if(n1->strand == -1 && n1->type == STOP && n2->strand == -1 && n2->type
258 == STOP) {
259 if(n1->stop_val <= n2->ndx) return;
260 if(n2->star_ptr[n1->ndx%3] == -1) return;
261 n3 = &(nod[n2->star_ptr[n1->ndx%3]]);
262 left -= 2; right = n3->ndx;
263 if(flag == 0) scr_mod = tinf->bias[0]*n3->gc_score[0] +
264 tinf->bias[1]*n3->gc_score[1] + tinf->bias[2]*n3->gc_score[2];
265 else if(flag == 1) score = n3->cscore + n3->sscore +
266 intergenic_mod(n3, n2, tinf);
267 }
268
269 /***************************************/
270 /* Overlapping Opposite Strand 3' Ends */
271 /***************************************/
272
273 /* 3'for->5'rev */
274 else if(n1->strand == 1 && n1->type == STOP && n2->strand == -1 && n2->type
275 != STOP) {
276 if(n2->stop_val-2 >= n1->ndx+2) return;
277 ovlp = (n1->ndx+2) - (n2->stop_val-2) + 1;
278 if(ovlp >= MAX_OPP_OVLP) return;
279 if((n1->ndx+2 - n2->stop_val-2 + 1) >= (n2->ndx -n1->ndx+3 + 1)) return;
280 if(n1->traceb == -1) bnd = 0;
281 else bnd = nod[n1->traceb].ndx;
282 if((n1->ndx+2 - n2->stop_val-2 + 1) >= (n2->stop_val-3 - bnd + 1)) return;
283 left = n2->stop_val-2;
284 if(flag == 0) scr_mod = tinf->bias[0]*n2->gc_score[0] +
285 tinf->bias[1]*n2->gc_score[1] + tinf->bias[2]*n2->gc_score[2];
286 else if(flag == 1) score = n2->cscore + n2->sscore - 0.15*tinf->st_wt;
287 }
288
289 if(flag == 0) score = ((double)(right-left+1-(ovlp*2)))*scr_mod;
290
291 if(n1->score + score >= n2->score) {
292 n2->score = n1->score + score;
293 n2->traceb = p1;
294 n2->ov_mark = maxfr;
295 }
296
297 return;
298 }
299
300 /*******************************************************************************
301 Sometimes bad genes creep into the model due to the node distance constraint
302 in the dynamic programming routine. This routine just does a sweep through
303 the genes and eliminates ones with negative scores.
304 *******************************************************************************/
305
eliminate_bad_genes(struct _node * nod,int dbeg,struct _training * tinf)306 void eliminate_bad_genes(struct _node *nod, int dbeg, struct _training *tinf) {
307 int path;
308
309 if(dbeg == -1) return;
310 path = dbeg;
311 while(nod[path].traceb != -1) path = nod[path].traceb;
312 while(nod[path].tracef != -1) {
313 if(nod[path].strand == 1 && nod[path].type == STOP)
314 nod[nod[path].tracef].sscore += intergenic_mod(&nod[path],
315 &nod[nod[path].tracef], tinf);
316 if(nod[path].strand == -1 && nod[path].type != STOP)
317 nod[path].sscore += intergenic_mod(&nod[path], &nod[nod[path].tracef],
318 tinf);
319 path = nod[path].tracef;
320 }
321
322 path = dbeg;
323 while(nod[path].traceb != -1) path = nod[path].traceb;
324 while(nod[path].tracef != -1) {
325 if(nod[path].strand == 1 && nod[path].type != STOP &&
326 nod[path].cscore + nod[path].sscore < 0) {
327 nod[path].elim = 1; nod[nod[path].tracef].elim = 1;
328 }
329 if(nod[path].strand == -1 && nod[path].type == STOP &&
330 nod[nod[path].tracef].cscore + nod[nod[path].tracef].sscore < 0) {
331 nod[path].elim = 1; nod[nod[path].tracef].elim = 1;
332 }
333 path = nod[path].tracef;
334 }
335 }
336