1 /*******************************************************************************
2     PRODIGAL (PROkaryotic DynamIc Programming Genefinding ALgorithm)
3     Copyright (C) 2007-2016 University of Tennessee / UT-Battelle
4 
5     Code Author:  Doug Hyatt
6 
7     This program is free software: you can redistribute it and/or modify
8     it under the terms of the GNU General Public License as published by
9     the Free Software Foundation, either version 3 of the License, or
10     (at your option) any later version.
11 
12     This program is distributed in the hope that it will be useful,
13     but WITHOUT ANY WARRANTY; without even the implied warranty of
14     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15     GNU General Public License for more details.
16 
17     You should have received a copy of the GNU General Public License
18     along with this program.  If not, see <http://www.gnu.org/licenses/>.
19 *******************************************************************************/
20 
21 #include "dprog.h"
22 
23 /*******************************************************************************
24   Basic dynamic programming routine for predicting genes.  The 'flag' variable
25   is set to 0 for the initial dynamic programming routine based solely on GC
26   frame plot (used to construct a training set.  If the flag is set to 1, the
27   routine does the final dynamic programming based on
28   coding, RBS scores, etc.
29 *******************************************************************************/
30 
dprog(struct _node * nod,int nn,struct _training * tinf,int flag)31 int dprog(struct _node *nod, int nn, struct _training *tinf, int flag) {
32   int i, j, min, max_ndx = -1, path, nxt, tmp;
33   double max_sc = -1.0;
34 
35   if(nn == 0) return -1;
36   for(i = 0; i < nn; i++) {
37     nod[i].score = 0;
38     nod[i].traceb = -1;
39     nod[i].tracef = -1;
40   }
41   for(i = 0; i < nn; i++) {
42 
43     /* Set up distance constraints for making connections, */
44     /* but make exceptions for giant ORFS.                 */
45     if(i < MAX_NODE_DIST) min = 0; else min = i-MAX_NODE_DIST;
46     if(nod[i].strand == -1 && nod[i].type != STOP && nod[min].ndx >=
47        nod[i].stop_val)
48       while(min >= 0 && nod[i].ndx != nod[i].stop_val) min--;
49     if(nod[i].strand == 1 && nod[i].type == STOP && nod[min].ndx >=
50        nod[i].stop_val)
51       while(min >= 0 && nod[i].ndx != nod[i].stop_val) min--;
52     if(min < MAX_NODE_DIST) min = 0;
53     else min = min-MAX_NODE_DIST;
54     for(j = min; j < i; j++) {
55       score_connection(nod, j, i, tinf, flag);
56     }
57   }
58   for(i = nn-1; i >= 0; i--) {
59     if(nod[i].strand == 1 && nod[i].type != STOP) continue;
60     if(nod[i].strand == -1 && nod[i].type == STOP) continue;
61     if(nod[i].score > max_sc) { max_sc = nod[i].score; max_ndx = i; }
62   }
63 
64   /* First Pass: untangle the triple overlaps */
65   path = max_ndx;
66   while(nod[path].traceb != -1) {
67     nxt = nod[path].traceb;
68     if(nod[path].strand == -1 && nod[path].type == STOP &&
69       nod[nxt].strand == 1 && nod[nxt].type == STOP &&
70       nod[path].ov_mark != -1 && nod[path].ndx > nod[nxt].ndx) {
71       tmp = nod[path].star_ptr[nod[path].ov_mark];
72       for(i = tmp; nod[i].ndx != nod[tmp].stop_val; i--);
73       nod[path].traceb = tmp;
74       nod[tmp].traceb = i;
75       nod[i].ov_mark = -1;
76       nod[i].traceb = nxt;
77     }
78     path = nod[path].traceb;
79   }
80 
81   /* Second Pass: Untangle the simple overlaps */
82   path = max_ndx;
83   while(nod[path].traceb != -1) {
84     nxt = nod[path].traceb;
85     if(nod[path].strand == -1 && nod[path].type != STOP && nod[nxt].strand == 1
86        && nod[nxt].type == STOP) {
87       for(i = path; nod[i].ndx != nod[path].stop_val; i--);
88       nod[path].traceb = i; nod[i].traceb = nxt;
89     }
90     if(nod[path].strand == 1 && nod[path].type == STOP && nod[nxt].strand == 1
91        && nod[nxt].type == STOP) {
92       nod[path].traceb = nod[nxt].star_ptr[(nod[path].ndx)%3];
93       nod[nod[path].traceb].traceb = nxt;
94     }
95     if(nod[path].strand == -1 && nod[path].type == STOP && nod[nxt].strand == -1
96        && nod[nxt].type == STOP) {
97       nod[path].traceb = nod[path].star_ptr[(nod[nxt].ndx)%3];
98       nod[nod[path].traceb].traceb = nxt;
99     }
100     path = nod[path].traceb;
101   }
102 
103   /* Mark forward pointers */
104   path = max_ndx;
105   while(nod[path].traceb != -1) {
106     nod[nod[path].traceb].tracef = path;
107     path = nod[path].traceb;
108   }
109 
110   if(nod[max_ndx].traceb == -1) return -1;
111   else return max_ndx;
112 }
113 
114 /*******************************************************************************
115   This routine scores the connection between two nodes, the most basic of which
116   is 5'fwd->3'fwd (gene) and 3'rev->5'rev (rev gene).  If the connection ending
117   at n2 is the maximal scoring model, it updates the pointers in the dynamic
118   programming model.  n3 is used to handle overlaps, i.e. cases where 5->3'
119   overlaps 5'->3' on the same strand.  In this case, 3' connects directly to 3',
120   and n3 is used to untangle the 5' end of the second gene.
121 *******************************************************************************/
122 
score_connection(struct _node * nod,int p1,int p2,struct _training * tinf,int flag)123 void score_connection(struct _node *nod, int p1, int p2, struct _training *tinf,
124                       int flag) {
125   struct _node *n1 = &(nod[p1]), *n2 = &(nod[p2]), *n3;
126   int i, left = n1->ndx, right = n2->ndx, bnd, ovlp = 0, maxfr = -1;
127   double score = 0.0, scr_mod = 0.0, maxval;
128 
129   /***********************/
130   /* Invalid Connections */
131   /***********************/
132 
133   /* 5'fwd->5'fwd, 5'rev->5'rev */
134   if(n1->type != STOP && n2->type != STOP && n1->strand == n2->strand) return;
135 
136   /* 5'fwd->5'rev, 5'fwd->3'rev */
137   else if(n1->strand == 1 && n1->type != STOP && n2->strand == -1) return;
138 
139   /* 3'rev->5'fwd, 3'rev->3'fwd) */
140   else if(n1->strand == -1 && n1->type == STOP && n2->strand == 1) return;
141 
142   /* 5'rev->3'fwd */
143   else if(n1->strand == -1 && n1->type != STOP && n2->strand == 1 && n2->type ==
144           STOP) return;
145 
146   /******************/
147   /* Edge Artifacts */
148   /******************/
149   if(n1->traceb == -1 && n1->strand == 1 && n1->type == STOP) return;
150   if(n1->traceb == -1 && n1->strand == -1 && n1->type != STOP) return;
151 
152   /*********/
153   /* Genes */
154   /*********/
155 
156   /* 5'fwd->3'fwd */
157   else if(n1->strand == n2->strand && n1->strand == 1 && n1->type != STOP &&
158           n2->type == STOP) {
159     if(n2->stop_val >= n1->ndx) return;
160     if(n1->ndx % 3 != n2->ndx % 3) return;
161     right += 2;
162     if(flag == 0) scr_mod = tinf->bias[0]*n1->gc_score[0] +
163                   tinf->bias[1]*n1->gc_score[1] + tinf->bias[2]*n1->gc_score[2];
164     else if(flag == 1) score = n1->cscore + n1->sscore;
165   }
166 
167   /* 3'rev->5'rev */
168   else if(n1->strand == n2->strand && n1->strand == -1 && n1->type == STOP &&
169           n2->type != STOP) {
170     if(n1->stop_val <= n2->ndx) return;
171     if(n1->ndx % 3 != n2->ndx % 3) return;
172     left -= 2;
173     if(flag == 0) scr_mod = tinf->bias[0]*n2->gc_score[0] +
174                   tinf->bias[1]*n2->gc_score[1] + tinf->bias[2]*n2->gc_score[2];
175     else if(flag == 1) score = n2->cscore + n2->sscore;
176   }
177 
178   /********************************/
179   /* Intergenic Space (Noncoding) */
180   /********************************/
181 
182   /* 3'fwd->5'fwd */
183   else if(n1->strand == 1 && n1->type == STOP && n2->strand == 1 && n2->type !=
184           STOP) {
185     left += 2;
186     if(left >= right) return;
187     if(flag == 1) score = intergenic_mod(n1, n2, tinf);
188   }
189 
190   /* 3'fwd->3'rev */
191   else if(n1->strand == 1 && n1->type == STOP && n2->strand == -1 && n2->type ==
192           STOP) {
193     left += 2; right -= 2;
194     if(left >= right) return;
195     /* Overlapping Gene Case 2: Three consecutive overlapping genes f r r */
196     maxfr = -1; maxval = 0.0;
197     for(i = 0; i < 3; i++) {
198       if(n2->star_ptr[i] == -1) continue;
199       n3 = &(nod[n2->star_ptr[i]]);
200       ovlp = left - n3->stop_val + 3;
201       if(ovlp <= 0 || ovlp >= MAX_OPP_OVLP) continue;
202       if(ovlp >= n3->ndx - left) continue;
203       if(n1->traceb == -1) continue;
204       if(ovlp >= n3->stop_val - nod[n1->traceb].ndx - 2) continue;
205       if((flag == 1 && n3->cscore + n3->sscore + intergenic_mod(n3, n2, tinf) >
206           maxval) || (flag == 0 && tinf->bias[0]*n3->gc_score[0] +
207           tinf->bias[1]*n3->gc_score[1] + tinf->bias[2]*n3->gc_score[2] >
208           maxval)) {
209         maxfr = i;
210         maxval = n3->cscore + n3->sscore + intergenic_mod(n3, n2, tinf);
211       }
212     }
213     if(maxfr != -1) {
214       n3 = &(nod[n2->star_ptr[maxfr]]);
215       if(flag == 0) scr_mod = tinf->bias[0]*n3->gc_score[0] +
216                     tinf->bias[1]*n3->gc_score[1] +
217                     tinf->bias[2]*n3->gc_score[2];
218       else if(flag == 1) score = n3->cscore + n3->sscore +
219               intergenic_mod(n3, n2, tinf);
220     }
221     else if(flag == 1) score = intergenic_mod(n1, n2, tinf);
222   }
223 
224   /* 5'rev->3'rev */
225   else if(n1->strand == -1 && n1->type != STOP && n2->strand == -1 && n2->type
226           == STOP) {
227     right -= 2;
228     if(left >= right) return;
229     if(flag == 1) score = intergenic_mod(n1, n2, tinf);
230   }
231 
232   /* 5'rev->5'fwd */
233   else if(n1->strand == -1 && n1->type != STOP && n2->strand == 1 && n2->type
234           != STOP) {
235     if(left >= right) return;
236     if(flag == 1) score = intergenic_mod(n1, n2, tinf);
237   }
238 
239   /********************/
240   /* Possible Operons */
241   /********************/
242 
243   /* 3'fwd->3'fwd, check for a start just to left of first 3' */
244   else if(n1->strand == 1 && n2->strand == 1 && n1->type == STOP && n2->type ==
245           STOP) {
246     if(n2->stop_val >= n1->ndx) return;
247     if(n1->star_ptr[n2->ndx%3] == -1) return;
248     n3 = &(nod[n1->star_ptr[n2->ndx%3]]);
249     left = n3->ndx; right += 2;
250     if(flag == 0) scr_mod = tinf->bias[0]*n3->gc_score[0] +
251                   tinf->bias[1]*n3->gc_score[1] + tinf->bias[2]*n3->gc_score[2];
252     else if(flag == 1) score = n3->cscore + n3->sscore +
253                        intergenic_mod(n1, n3, tinf);
254   }
255 
256   /* 3'rev->3'rev, check for a start just to right of second 3' */
257   else if(n1->strand == -1 && n1->type == STOP && n2->strand == -1 && n2->type
258           == STOP) {
259     if(n1->stop_val <= n2->ndx) return;
260     if(n2->star_ptr[n1->ndx%3] == -1) return;
261     n3 = &(nod[n2->star_ptr[n1->ndx%3]]);
262     left -= 2; right = n3->ndx;
263     if(flag == 0) scr_mod = tinf->bias[0]*n3->gc_score[0] +
264                   tinf->bias[1]*n3->gc_score[1] + tinf->bias[2]*n3->gc_score[2];
265     else if(flag == 1) score = n3->cscore + n3->sscore +
266                        intergenic_mod(n3, n2, tinf);
267   }
268 
269   /***************************************/
270   /* Overlapping Opposite Strand 3' Ends */
271   /***************************************/
272 
273   /* 3'for->5'rev */
274   else if(n1->strand == 1 && n1->type == STOP && n2->strand == -1 && n2->type
275           != STOP) {
276     if(n2->stop_val-2 >= n1->ndx+2) return;
277     ovlp = (n1->ndx+2) - (n2->stop_val-2) + 1;
278     if(ovlp >= MAX_OPP_OVLP) return;
279     if((n1->ndx+2 - n2->stop_val-2 + 1) >= (n2->ndx -n1->ndx+3 + 1)) return;
280     if(n1->traceb == -1) bnd = 0;
281     else bnd = nod[n1->traceb].ndx;
282     if((n1->ndx+2 - n2->stop_val-2 + 1) >= (n2->stop_val-3 - bnd + 1)) return;
283     left = n2->stop_val-2;
284     if(flag == 0) scr_mod = tinf->bias[0]*n2->gc_score[0] +
285                   tinf->bias[1]*n2->gc_score[1] + tinf->bias[2]*n2->gc_score[2];
286     else if(flag == 1) score = n2->cscore + n2->sscore - 0.15*tinf->st_wt;
287   }
288 
289   if(flag == 0) score = ((double)(right-left+1-(ovlp*2)))*scr_mod;
290 
291   if(n1->score + score >= n2->score) {
292     n2->score = n1->score + score;
293     n2->traceb = p1;
294     n2->ov_mark = maxfr;
295   }
296 
297   return;
298 }
299 
300 /*******************************************************************************
301   Sometimes bad genes creep into the model due to the node distance constraint
302   in the dynamic programming routine.  This routine just does a sweep through
303   the genes and eliminates ones with negative scores.
304 *******************************************************************************/
305 
eliminate_bad_genes(struct _node * nod,int dbeg,struct _training * tinf)306 void eliminate_bad_genes(struct _node *nod, int dbeg, struct _training *tinf) {
307   int path;
308 
309   if(dbeg == -1) return;
310   path = dbeg;
311   while(nod[path].traceb != -1) path = nod[path].traceb;
312   while(nod[path].tracef != -1) {
313     if(nod[path].strand == 1 && nod[path].type == STOP)
314       nod[nod[path].tracef].sscore += intergenic_mod(&nod[path],
315                                       &nod[nod[path].tracef], tinf);
316     if(nod[path].strand == -1 && nod[path].type != STOP)
317       nod[path].sscore += intergenic_mod(&nod[path], &nod[nod[path].tracef],
318                           tinf);
319     path = nod[path].tracef;
320   }
321 
322   path = dbeg;
323   while(nod[path].traceb != -1) path = nod[path].traceb;
324   while(nod[path].tracef != -1) {
325     if(nod[path].strand == 1 && nod[path].type != STOP &&
326        nod[path].cscore + nod[path].sscore < 0) {
327       nod[path].elim = 1; nod[nod[path].tracef].elim = 1;
328     }
329     if(nod[path].strand == -1 && nod[path].type == STOP &&
330        nod[nod[path].tracef].cscore + nod[nod[path].tracef].sscore < 0) {
331       nod[path].elim = 1; nod[nod[path].tracef].elim = 1;
332     }
333     path = nod[path].tracef;
334   }
335 }
336