1 static char rcsid[] = "$Id: changepoint.c 40271 2011-05-28 02:29:18Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5 
6 #include "changepoint.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h>		/* For sqrt */
10 
11 #define NPSEUDO 12.0
12 #define SLACK 0.10
13 
14 #ifdef DEBUG
15 #define debug(x) x
16 #else
17 #define debug(x)
18 #endif
19 
20 
21 /* Based on Chow test */
22 
23 
24 /* Goes from 3' to 5', looking for the last sharp decrease in matches */
25 int
Changepoint_left(int * nmatches_left,int * ntotal_left,int * matchscores,int length)26 Changepoint_left (int *nmatches_left, int *ntotal_left, int *matchscores, int length) {
27   int edge = 0;
28 
29   /* x signifies nmatches, y signifies nmismatches, x + y = n */
30   int pos, x, y, n, x_past, y_past, x_future, y_future, n_past, n_future;
31   double theta, x_pseudo, theta_past, theta_future, rss, rss_past, rss_future, rss_sep;
32   double min_rss_sep;
33 #ifdef DEBUG
34   double fscore;
35 #endif
36 
37   *nmatches_left = *ntotal_left = 0;
38 
39   x = y = 0;
40   for (pos = 0; pos < length; pos++) {
41     /* Ignore cases where matchscores == -1 */
42     if (matchscores[pos] == 1) {
43       x++;
44     } else if (matchscores[pos] == 0) {
45       y++;
46     }
47   }
48   n = x + y;
49 
50   /* when rss_sep == rss, fscore == 0 */
51   min_rss_sep = rss = (double) x * (double) y/(double) n;
52   if (rss == 0.0) {
53     return 0;
54   }
55 
56   /* Compute x_pseudo */
57   theta = (double) x/(double) n;
58   x_pseudo = NPSEUDO * theta;
59   debug(printf("%d %d %d %f\n",x,y,n,theta));
60 
61   x_past = y_past = n_past = 0;
62   x_future = x;
63   y_future = y;
64   n_future = n;
65 
66 
67   debug(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
68 		"pos","match","x.past","y.past","n.past","x.future","y.future","n.future",
69 		"theta.past","theta.future","rss.past","rss.future","fscore"));
70 
71   for (pos = length-1; pos > 0; --pos) {
72     if (matchscores[pos] >= 0) {
73       if (matchscores[pos] == 1) {
74 	x_past++;
75 	x_future--;
76       } else {
77 	y_past++;
78 	y_future--;
79       }
80       n_past++;
81       n_future--;
82 
83       theta_past = ((double) x_past + x_pseudo)/((double) n_past + NPSEUDO);
84       theta_future = ((double) x_future + x_pseudo)/((double) n_future + NPSEUDO);
85       rss_past = (double) x_past*(1.0-theta_past)*(1.0-theta_past) + (double) y_past*theta_past*theta_past;
86       rss_future = (double) x_future*(1.0-theta_future)*(1.0-theta_future) + (double) y_future*theta_future*theta_future;
87       rss_sep = rss_past + rss_future;
88 
89       if (rss_sep == 0.0) {
90 	debug(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
91 		     pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
92 		     theta_past,theta_future,rss_past,rss_future));
93 
94 #if 0
95       } else if (theta_future < theta_past) {
96 	debug(printf("Changepoint_start_change aborting early with edge=%d\n",edge));
97 	return edge;
98 #endif
99 
100       } else {
101 	debug(
102 	      fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
103 	      printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
104 		     pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
105 		     theta_past,theta_future,rss_past,rss_future,fscore);
106 	      );
107 
108 	/* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
109 	   1) is maximized when rss_sep is minimized */
110 
111 	if (theta_future < theta_past - SLACK) {
112 	  if (rss_sep < min_rss_sep) {
113 	    min_rss_sep = rss_sep;
114 	    edge = pos;
115 	    *nmatches_left = x_future;
116 	    *ntotal_left = n_future;
117 	    debug(printf("Set end_change to %d (nmatches %d, ntotal %d on left)\n",pos,*nmatches_left,*ntotal_left));
118 	    debug(printf("Set start_change to %d\n",pos));
119 	  }
120 	}
121       }
122     }
123   }
124 
125 
126 
127   debug(printf("Changepoint_left returning %d\n",edge));
128   return edge;
129 }
130 
131 
132 /* Goes from 5' to 3', looking for the last sharp decrease in matches */
133 int
Changepoint_right(int * nmatches_right,int * ntotal_right,int * matchscores,int length)134 Changepoint_right (int *nmatches_right, int *ntotal_right, int *matchscores, int length) {
135   int edge = length;
136 
137   /* x signifies nmatches, y signifies nmismatches, x + y = n */
138   int pos, x, y, n, x_past, y_past, x_future, y_future, n_past, n_future;
139   double theta, x_pseudo, theta_past, theta_future, rss, rss_past, rss_future, rss_sep;
140   double min_rss_sep;
141 #ifdef DEBUG
142   double fscore;
143 #endif
144 
145   *nmatches_right = *ntotal_right = 0;
146 
147   x = y = 0;
148   for (pos = 0; pos < length; pos++) {
149     /* Ignore cases where matchscores == -1 */
150     if (matchscores[pos] == 1) {
151       x++;
152     } else if (matchscores[pos] == 0) {
153       y++;
154     }
155 
156   }
157   n = x + y;
158 
159   /* when rss_sep == rss, fscore == 0 */
160   min_rss_sep = rss = (double) x * (double) y/(double) n;
161   if (rss == 0.0) {
162     return length;
163   }
164 
165   /* Compute x_pseudo */
166   theta = (double) x/(double) n;
167   x_pseudo = NPSEUDO * theta;
168   debug(printf("%d %d %d %f\n",x,y,n,theta));
169 
170   x_past = y_past = n_past = 0.0;
171   x_future = x;
172   y_future = y;
173   n_future = n;
174 
175   debug(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
176 		"pos","match","x.past","y.past","n.past","x.future","y.future","n.future",
177 		"theta.past","theta.future","rss.past","rss.future","fscore"));
178 
179   for (pos = 1; pos < length; pos++) {
180     if (matchscores[pos] >= 0) {
181       if (matchscores[pos] == 1) {
182 	x_past += 1.0;
183 	x_future -= 1.0;
184       } else {
185 	y_past += 1.0;
186 	y_future -= 1.0;
187       }
188       n_past += 1.0;
189       n_future -= 1.0;
190 
191       theta_past = ((double) x_past + x_pseudo)/((double) n_past + NPSEUDO);
192       theta_future = ((double) x_future + x_pseudo)/((double) n_future + NPSEUDO);
193       rss_past = (double) x_past*(1.0-theta_past)*(1.0-theta_past) + (double) y_past*theta_past*theta_past;
194       rss_future = (double) x_future*(1.0-theta_future)*(1.0-theta_future) + (double) y_future*theta_future*theta_future;
195       rss_sep = rss_past + rss_future;
196 
197       if (rss_sep == 0.0) {
198 	debug(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
199 		     pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
200 		     theta_past,theta_future,rss_past,rss_future));
201 
202 #if 0
203       } else if (theta_past < theta_future) {
204 	debug(printf("Changepoint_end_change aborting early with edge=%d\n",edge));
205 	return edge;
206 #endif
207 
208       } else {
209 	debug(
210 	      fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
211 	      printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
212 		     pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
213 		     theta_past,theta_future,rss_past,rss_future,fscore);
214 	      );
215 
216 	/* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
217 	   1) is maximized when rss_sep is minimized */
218 
219 	if (theta_future < theta_past - SLACK) {
220 	  if (rss_sep < min_rss_sep) {
221 	    min_rss_sep = rss_sep;
222 	    edge = pos;
223 	    *nmatches_right = x_future;
224 	    *ntotal_right = n_future;
225 	    debug(printf("Set end_change to %d (nmatches %d, ntotal %d on right)\n",pos,*nmatches_right,*ntotal_right));
226 	  }
227 	}
228       }
229     }
230   }
231 
232   debug(printf("Changepoint_right returning %d\n",edge));
233   return edge;
234 }
235 
236 
237 
238 #if 0
239 int
240 Changepoint_both_ends (int *newstart, int *newend, int *matchscores, int length) {
241   int breakpoint;
242   double pvalue = 1.0;
243   int *matchscores;
244 
245   /* x signifies nmatches, y signifies nmismatches, x + y = n */
246   int start, end, pos, x = 0, y, n, x_left, y_left, x_right, y_right, n_left, n_right;
247   double theta, x_pseudo, theta_left, theta_right, rss, rss_left, rss_right, rss_sep;
248   double fscore, min_rss_sep, best_pos = -1, best_theta_left, best_theta_right;
249 
250   /*
251   start = Sequence_trim_start(queryseq);
252   end = Sequence_trim_end(queryseq);
253   */
254 
255   start = 0;
256   end = length;
257 
258   x = y = 0;
259   for (pos = 0; pos < length; pos++) {
260     /* Ignore cases where value is -1 */
261     if (matchscores[pos] == 1) {
262       x++;
263     } else if (matchscores[pos] == 0) {
264       y++;
265     }
266   }
267   n = x + y;
268 
269   /* when rss_sep == rss, fscore == 0 */
270   min_rss_sep = rss = (double) x * (double) y/(double) n;
271   if (rss == 0.0) {
272     FREE(matchscores);
273     return 0;
274   }
275 
276   theta = (double) x/(double) n;
277   x_pseudo = NPSEUDO * theta;
278   debug(printf("%d %d %d %f\n",x,y,n,theta));
279 
280   x_left = y_left = n_left = 0;
281   x_right = x;
282   y_right = y;
283   n_right = n;
284 
285   debug(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
286 		"pos","match","x.left","y.left","n.left","x.right","y.right","n.right",
287 		"theta.left","theta.right","rss.left","rss.right","fscore"));
288 
289   for (pos = start; pos < end-1; pos++) {
290     if (matchscores[pos] == 1) {
291       x_left++;
292       x_right--;
293     } else if (matchscores[pos] == 0) {
294       y_left++;
295       y_right--;
296     } else {
297       abort();
298     }
299     n_left++;
300     n_right--;
301 
302     theta_left = ((double) x_left + x_pseudo)/((double) n_left + NPSEUDO);
303     theta_right = ((double) x_right + x_pseudo)/((double) n_right + NPSEUDO);
304     rss_left = x_left*(1.0-theta_left)*(1.0-theta_left) + y_left*theta_left*theta_left;
305     rss_right = x_right*(1.0-theta_right)*(1.0-theta_right) + y_right*theta_right*theta_right;
306     rss_sep = rss_left + rss_right;
307 
308     if (rss_sep == 0) {
309       debug(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
310 		    pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
311 		    theta_left,theta_right,rss_left,rss_right));
312     } else {
313       debug(
314 	     fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
315 	     printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
316 		    pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
317 		    theta_left,theta_right,rss_left,rss_right,fscore);
318 	     );
319 
320       /* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
321 	 1) is maximized when rss_sep is minimized */
322 
323       if (rss_sep < min_rss_sep) {
324 	min_rss_sep = rss_sep;
325 	best_pos = pos;
326 	best_theta_left = theta_left;
327 	best_theta_right = theta_right;
328       }
329     }
330   }
331   FREE(matchscores);
332 
333   fscore = ((double) (n - 2))*(rss - min_rss_sep)/min_rss_sep;
334   if (fscore < fthreshold) {
335     return 0;
336   } else {
337     breakpoint = best_pos;
338     debug(printf("at %d, fscore = %f\n",breakpoint,fscore));
339     if (best_theta_left < best_theta_right) {
340       /* trim left */
341       *newstart = breakpoint;
342       *newend = end;
343       return breakpoint - start;
344     } else {
345       /* trim right */
346       *newstart = start;
347       *newend = breakpoint;
348       return end - breakpoint;
349     }
350   }
351 }
352 
353 #endif
354