1 static char rcsid[] = "$Id: changepoint.c 40271 2011-05-28 02:29:18Z twu $";
2 #ifdef HAVE_CONFIG_H
3 #include <config.h>
4 #endif
5
6 #include "changepoint.h"
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <math.h> /* For sqrt */
10
11 #define NPSEUDO 12.0
12 #define SLACK 0.10
13
14 #ifdef DEBUG
15 #define debug(x) x
16 #else
17 #define debug(x)
18 #endif
19
20
21 /* Based on Chow test */
22
23
24 /* Goes from 3' to 5', looking for the last sharp decrease in matches */
25 int
Changepoint_left(int * nmatches_left,int * ntotal_left,int * matchscores,int length)26 Changepoint_left (int *nmatches_left, int *ntotal_left, int *matchscores, int length) {
27 int edge = 0;
28
29 /* x signifies nmatches, y signifies nmismatches, x + y = n */
30 int pos, x, y, n, x_past, y_past, x_future, y_future, n_past, n_future;
31 double theta, x_pseudo, theta_past, theta_future, rss, rss_past, rss_future, rss_sep;
32 double min_rss_sep;
33 #ifdef DEBUG
34 double fscore;
35 #endif
36
37 *nmatches_left = *ntotal_left = 0;
38
39 x = y = 0;
40 for (pos = 0; pos < length; pos++) {
41 /* Ignore cases where matchscores == -1 */
42 if (matchscores[pos] == 1) {
43 x++;
44 } else if (matchscores[pos] == 0) {
45 y++;
46 }
47 }
48 n = x + y;
49
50 /* when rss_sep == rss, fscore == 0 */
51 min_rss_sep = rss = (double) x * (double) y/(double) n;
52 if (rss == 0.0) {
53 return 0;
54 }
55
56 /* Compute x_pseudo */
57 theta = (double) x/(double) n;
58 x_pseudo = NPSEUDO * theta;
59 debug(printf("%d %d %d %f\n",x,y,n,theta));
60
61 x_past = y_past = n_past = 0;
62 x_future = x;
63 y_future = y;
64 n_future = n;
65
66
67 debug(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
68 "pos","match","x.past","y.past","n.past","x.future","y.future","n.future",
69 "theta.past","theta.future","rss.past","rss.future","fscore"));
70
71 for (pos = length-1; pos > 0; --pos) {
72 if (matchscores[pos] >= 0) {
73 if (matchscores[pos] == 1) {
74 x_past++;
75 x_future--;
76 } else {
77 y_past++;
78 y_future--;
79 }
80 n_past++;
81 n_future--;
82
83 theta_past = ((double) x_past + x_pseudo)/((double) n_past + NPSEUDO);
84 theta_future = ((double) x_future + x_pseudo)/((double) n_future + NPSEUDO);
85 rss_past = (double) x_past*(1.0-theta_past)*(1.0-theta_past) + (double) y_past*theta_past*theta_past;
86 rss_future = (double) x_future*(1.0-theta_future)*(1.0-theta_future) + (double) y_future*theta_future*theta_future;
87 rss_sep = rss_past + rss_future;
88
89 if (rss_sep == 0.0) {
90 debug(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
91 pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
92 theta_past,theta_future,rss_past,rss_future));
93
94 #if 0
95 } else if (theta_future < theta_past) {
96 debug(printf("Changepoint_start_change aborting early with edge=%d\n",edge));
97 return edge;
98 #endif
99
100 } else {
101 debug(
102 fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
103 printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
104 pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
105 theta_past,theta_future,rss_past,rss_future,fscore);
106 );
107
108 /* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
109 1) is maximized when rss_sep is minimized */
110
111 if (theta_future < theta_past - SLACK) {
112 if (rss_sep < min_rss_sep) {
113 min_rss_sep = rss_sep;
114 edge = pos;
115 *nmatches_left = x_future;
116 *ntotal_left = n_future;
117 debug(printf("Set end_change to %d (nmatches %d, ntotal %d on left)\n",pos,*nmatches_left,*ntotal_left));
118 debug(printf("Set start_change to %d\n",pos));
119 }
120 }
121 }
122 }
123 }
124
125
126
127 debug(printf("Changepoint_left returning %d\n",edge));
128 return edge;
129 }
130
131
132 /* Goes from 5' to 3', looking for the last sharp decrease in matches */
133 int
Changepoint_right(int * nmatches_right,int * ntotal_right,int * matchscores,int length)134 Changepoint_right (int *nmatches_right, int *ntotal_right, int *matchscores, int length) {
135 int edge = length;
136
137 /* x signifies nmatches, y signifies nmismatches, x + y = n */
138 int pos, x, y, n, x_past, y_past, x_future, y_future, n_past, n_future;
139 double theta, x_pseudo, theta_past, theta_future, rss, rss_past, rss_future, rss_sep;
140 double min_rss_sep;
141 #ifdef DEBUG
142 double fscore;
143 #endif
144
145 *nmatches_right = *ntotal_right = 0;
146
147 x = y = 0;
148 for (pos = 0; pos < length; pos++) {
149 /* Ignore cases where matchscores == -1 */
150 if (matchscores[pos] == 1) {
151 x++;
152 } else if (matchscores[pos] == 0) {
153 y++;
154 }
155
156 }
157 n = x + y;
158
159 /* when rss_sep == rss, fscore == 0 */
160 min_rss_sep = rss = (double) x * (double) y/(double) n;
161 if (rss == 0.0) {
162 return length;
163 }
164
165 /* Compute x_pseudo */
166 theta = (double) x/(double) n;
167 x_pseudo = NPSEUDO * theta;
168 debug(printf("%d %d %d %f\n",x,y,n,theta));
169
170 x_past = y_past = n_past = 0.0;
171 x_future = x;
172 y_future = y;
173 n_future = n;
174
175 debug(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
176 "pos","match","x.past","y.past","n.past","x.future","y.future","n.future",
177 "theta.past","theta.future","rss.past","rss.future","fscore"));
178
179 for (pos = 1; pos < length; pos++) {
180 if (matchscores[pos] >= 0) {
181 if (matchscores[pos] == 1) {
182 x_past += 1.0;
183 x_future -= 1.0;
184 } else {
185 y_past += 1.0;
186 y_future -= 1.0;
187 }
188 n_past += 1.0;
189 n_future -= 1.0;
190
191 theta_past = ((double) x_past + x_pseudo)/((double) n_past + NPSEUDO);
192 theta_future = ((double) x_future + x_pseudo)/((double) n_future + NPSEUDO);
193 rss_past = (double) x_past*(1.0-theta_past)*(1.0-theta_past) + (double) y_past*theta_past*theta_past;
194 rss_future = (double) x_future*(1.0-theta_future)*(1.0-theta_future) + (double) y_future*theta_future*theta_future;
195 rss_sep = rss_past + rss_future;
196
197 if (rss_sep == 0.0) {
198 debug(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
199 pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
200 theta_past,theta_future,rss_past,rss_future));
201
202 #if 0
203 } else if (theta_past < theta_future) {
204 debug(printf("Changepoint_end_change aborting early with edge=%d\n",edge));
205 return edge;
206 #endif
207
208 } else {
209 debug(
210 fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
211 printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
212 pos,matchscores[pos],x_past,y_past,n_past,x_future,y_future,n_future,
213 theta_past,theta_future,rss_past,rss_future,fscore);
214 );
215
216 /* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
217 1) is maximized when rss_sep is minimized */
218
219 if (theta_future < theta_past - SLACK) {
220 if (rss_sep < min_rss_sep) {
221 min_rss_sep = rss_sep;
222 edge = pos;
223 *nmatches_right = x_future;
224 *ntotal_right = n_future;
225 debug(printf("Set end_change to %d (nmatches %d, ntotal %d on right)\n",pos,*nmatches_right,*ntotal_right));
226 }
227 }
228 }
229 }
230 }
231
232 debug(printf("Changepoint_right returning %d\n",edge));
233 return edge;
234 }
235
236
237
238 #if 0
239 int
240 Changepoint_both_ends (int *newstart, int *newend, int *matchscores, int length) {
241 int breakpoint;
242 double pvalue = 1.0;
243 int *matchscores;
244
245 /* x signifies nmatches, y signifies nmismatches, x + y = n */
246 int start, end, pos, x = 0, y, n, x_left, y_left, x_right, y_right, n_left, n_right;
247 double theta, x_pseudo, theta_left, theta_right, rss, rss_left, rss_right, rss_sep;
248 double fscore, min_rss_sep, best_pos = -1, best_theta_left, best_theta_right;
249
250 /*
251 start = Sequence_trim_start(queryseq);
252 end = Sequence_trim_end(queryseq);
253 */
254
255 start = 0;
256 end = length;
257
258 x = y = 0;
259 for (pos = 0; pos < length; pos++) {
260 /* Ignore cases where value is -1 */
261 if (matchscores[pos] == 1) {
262 x++;
263 } else if (matchscores[pos] == 0) {
264 y++;
265 }
266 }
267 n = x + y;
268
269 /* when rss_sep == rss, fscore == 0 */
270 min_rss_sep = rss = (double) x * (double) y/(double) n;
271 if (rss == 0.0) {
272 FREE(matchscores);
273 return 0;
274 }
275
276 theta = (double) x/(double) n;
277 x_pseudo = NPSEUDO * theta;
278 debug(printf("%d %d %d %f\n",x,y,n,theta));
279
280 x_left = y_left = n_left = 0;
281 x_right = x;
282 y_right = y;
283 n_right = n;
284
285 debug(printf("%s %s %s %s %s %s %s %s %s %s %s %s %s\n",
286 "pos","match","x.left","y.left","n.left","x.right","y.right","n.right",
287 "theta.left","theta.right","rss.left","rss.right","fscore"));
288
289 for (pos = start; pos < end-1; pos++) {
290 if (matchscores[pos] == 1) {
291 x_left++;
292 x_right--;
293 } else if (matchscores[pos] == 0) {
294 y_left++;
295 y_right--;
296 } else {
297 abort();
298 }
299 n_left++;
300 n_right--;
301
302 theta_left = ((double) x_left + x_pseudo)/((double) n_left + NPSEUDO);
303 theta_right = ((double) x_right + x_pseudo)/((double) n_right + NPSEUDO);
304 rss_left = x_left*(1.0-theta_left)*(1.0-theta_left) + y_left*theta_left*theta_left;
305 rss_right = x_right*(1.0-theta_right)*(1.0-theta_right) + y_right*theta_right*theta_right;
306 rss_sep = rss_left + rss_right;
307
308 if (rss_sep == 0) {
309 debug(printf("%d %d %d %d %d %d %d %d %f %f %f %f NA\n",
310 pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
311 theta_left,theta_right,rss_left,rss_right));
312 } else {
313 debug(
314 fscore = ((double) (n - 2))*(rss - rss_sep)/rss_sep;
315 printf("%d %d %d %d %d %d %d %d %f %f %f %f %f\n",
316 pos,matchscores[pos],x_left,y_left,n_left,x_right,y_right,n_right,
317 theta_left,theta_right,rss_left,rss_right,fscore);
318 );
319
320 /* fscore = (n-2)*(rss - rss_sep)/rss_sep = (n-2)*(rss/rss_sep -
321 1) is maximized when rss_sep is minimized */
322
323 if (rss_sep < min_rss_sep) {
324 min_rss_sep = rss_sep;
325 best_pos = pos;
326 best_theta_left = theta_left;
327 best_theta_right = theta_right;
328 }
329 }
330 }
331 FREE(matchscores);
332
333 fscore = ((double) (n - 2))*(rss - min_rss_sep)/min_rss_sep;
334 if (fscore < fthreshold) {
335 return 0;
336 } else {
337 breakpoint = best_pos;
338 debug(printf("at %d, fscore = %f\n",breakpoint,fscore));
339 if (best_theta_left < best_theta_right) {
340 /* trim left */
341 *newstart = breakpoint;
342 *newend = end;
343 return breakpoint - start;
344 } else {
345 /* trim right */
346 *newstart = start;
347 *newend = breakpoint;
348 return end - breakpoint;
349 }
350 }
351 }
352
353 #endif
354