1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif
4
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <math.h>
8 #include <ctype.h>
9 #include <string.h>
10
11 #include <svm.h>
12
13 #include "ViennaRNA/utils/basic.h"
14 #include "ViennaRNA/fold_vars.h"
15 #include "ViennaRNA/pair_mat.h"
16 #include "ViennaRNA/utils/svm.h"
17
18
19 #include "ViennaRNA/params/svm_model_avg.inc" /* defines avg_model_string */
20 #include "ViennaRNA/params/svm_model_sd.inc" /* defines sd_model_string */
21
22
23 PRIVATE struct svm_model *avg_model;
24 PRIVATE struct svm_model *sd_model;
25
26 PRIVATE void
27 freeFields(char **fields);
28
29
30 PRIVATE char **
31 splitFields(char *string);
32
33
34 PRIVATE char **
35 splitLines(char *string);
36
37
38 PUBLIC float
get_z(char * sequence,double energy)39 get_z(char *sequence,
40 double energy)
41 {
42 double average_free_energy;
43 double sd_free_energy;
44 float my_z = 0.;
45 int info_avg;
46
47 make_pair_matrix();
48 short *S = encode_sequence(sequence, 0);
49 unsigned int length = strlen(sequence);
50 int *AUGC = get_seq_composition(S, 1, length, length);
51 avg_model = svm_load_model_string(avg_model_string);
52 sd_model = svm_load_model_string(sd_model_string);
53 average_free_energy = avg_regression(AUGC[0],
54 AUGC[1],
55 AUGC[2],
56 AUGC[3],
57 AUGC[4],
58 avg_model,
59 &info_avg);
60
61 if (info_avg == 0) {
62 double difference = (energy /* /100*/) - average_free_energy;
63 sd_free_energy = sd_regression(AUGC[0], AUGC[1], AUGC[2], AUGC[3], AUGC[4], sd_model);
64 my_z = difference / sd_free_energy;
65 } else {
66 vrna_message_warning("sequence out of bounds");
67 #if 0
68 my_z = shuffle_score(sequence, energy);
69 #endif
70 }
71
72 free(AUGC);
73 free(S);
74 svm_free_model_content(avg_model);
75 svm_free_model_content(sd_model);
76 return my_z;
77 }
78
79
80 PUBLIC int *
get_seq_composition(short * S,unsigned int start,unsigned int stop,unsigned int length)81 get_seq_composition(short *S,
82 unsigned int start,
83 unsigned int stop,
84 unsigned int length)
85 {
86 unsigned int i;
87 int *ret = (int *)vrna_alloc(sizeof(int) * 6);
88
89 for (i = MAX2(start, 1); i <= MIN2(stop, length); i++) {
90 if (S[i] > 4)
91 ret[0]++;
92 else
93 ret[S[i]]++;
94 }
95 ret[5] = -1; /* indicate last entry */
96 return ret;
97 }
98
99
100 PUBLIC double
sd_regression(int N,int A,int C,int G,int T,struct svm_model * sd_model)101 sd_regression(int N,
102 int A,
103 int C,
104 int G,
105 int T,
106 struct svm_model *sd_model)
107 {
108 double sd_free_energy = 0.0;
109 int length = A + C + G + T + N;
110 double GC_content = (double)(G + C) / length;
111 double AT_ratio = (double)A / (A + T);
112 double CG_ratio = (double)C / (C + G);
113 double norm_length = (double)(length - 50) / 350.0;
114 struct svm_node node_mono[5];
115
116 node_mono[0].index = 1;
117 node_mono[0].value = GC_content;
118 node_mono[1].index = 2;
119 node_mono[1].value = AT_ratio;
120 node_mono[2].index = 3;
121 node_mono[2].value = CG_ratio;
122 node_mono[3].index = 4;
123 node_mono[3].value = norm_length;
124 node_mono[4].index = -1;
125
126 sd_free_energy = svm_predict(sd_model, node_mono);
127
128 sd_free_energy = (double)sd_free_energy * sqrt(length);
129
130 return sd_free_energy;
131 }
132
133
134 PUBLIC double
avg_regression(int N,int A,int C,int G,int T,struct svm_model * avg_model,int * info)135 avg_regression(int N,
136 int A,
137 int C,
138 int G,
139 int T,
140 struct svm_model *avg_model,
141 int *info)
142 {
143 double average_free_energy = 0.0;
144
145 int length = A + C + G + T + N;
146 double N_fraction = (double)N / length;
147 double GC_content = (double)(G + C) / length;
148 double AT_ratio = (double)A / (A + T);
149 double CG_ratio = (double)C / (C + G);
150
151 double norm_length = (double)(length - 50) / 350.0;
152
153 struct svm_node node_mono[5];
154
155 *info = 0;
156 if (length < 50 || length > 400) {
157 *info = 1;
158 return 0.0;
159 }
160
161 if (N_fraction > 0.05) {
162 *info = 2;
163 return 0.0;
164 }
165
166 if (GC_content < 0.20 || GC_content > 0.80) {
167 *info = 3;
168 return 0.0;
169 }
170
171 if (AT_ratio < 0.20 || AT_ratio > 0.80) {
172 *info = 4;
173 return 0.0;
174 }
175
176 if (CG_ratio < 0.20 || CG_ratio > 0.80) {
177 *info = 5;
178 return 0.0;
179 }
180
181 node_mono[0].index = 1;
182 node_mono[0].value = GC_content;
183 node_mono[1].index = 2;
184 node_mono[1].value = AT_ratio;
185 node_mono[2].index = 3;
186 node_mono[2].value = CG_ratio;
187 node_mono[3].index = 4;
188 node_mono[3].value = norm_length;
189 node_mono[4].index = -1;
190
191 average_free_energy = svm_predict(avg_model, node_mono);
192
193 average_free_energy = (double)average_free_energy * length;
194
195 return average_free_energy;
196 }
197
198
199 PUBLIC double
minimal_sd(int N,int A,int C,int G,int T)200 minimal_sd(int N,
201 int A,
202 int C,
203 int G,
204 int T)
205 {
206 int length = A + C + G + T + N;
207
208 if (length < 60)
209 return 0.450324;
210
211 if (length < 70)
212 return 0.749771;
213
214 if (length < 80)
215 return 1.029421;
216
217 if (length < 90)
218 return 1.027517;
219
220 if (length < 100)
221 return 1.347283;
222
223 if (length < 120)
224 return 1.112086;
225
226 if (length < 150)
227 return 1.574339;
228
229 if (length < 170)
230 return 1.779043;
231
232 if (length < 200)
233 return 1.922908;
234
235 if (length < 250)
236 return 2.226856;
237
238 if (length < 300)
239 return 2.349300;
240
241 if (length < 350)
242 return 2.589703;
243
244 if (length < 400)
245 return 2.791215;
246
247 return 0.450324;
248 }
249
250
251 PUBLIC struct svm_model *
svm_load_model_string(char * modelString)252 svm_load_model_string(char *modelString)
253 {
254 /* redefinition from svm.cpp */
255 char *svm_type_table[] = {
256 "c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr", NULL
257 };
258 char *kernel_type_table[] = {
259 "linear", "polynomial", "rbf", "sigmoid", NULL
260 };
261
262 struct svm_model *model;
263 char **lines, **fields;
264 int i, j, k, l, m;
265 char *key, *value, *field;
266 char c;
267 int dataStart, elements;
268 struct svm_node *x_space = NULL;
269
270 model = (struct svm_model *)vrna_alloc(sizeof(struct svm_model));
271
272 model->rho = NULL;
273 model->probA = NULL;
274 model->probB = NULL;
275 model->label = NULL;
276 model->nSV = NULL;
277
278
279 /* Read header until support vectors start */
280 lines = splitLines(modelString);
281 i = 0;
282 while (lines[i] && (strcmp(lines[i], "SV") != 0)) {
283 fields = splitFields(lines[i]);
284
285 key = fields[0];
286
287 if (strcmp(key, "svm_type") == 0) {
288 value = fields[1];
289 for (j = 0; svm_type_table[j]; j++) {
290 if (strcmp(svm_type_table[j], value) == 0) {
291 model->param.svm_type = j;
292 break;
293 }
294 }
295 if (svm_type_table[i] == NULL) {
296 vrna_message_warning("unknown svm type.");
297 free(model->rho);
298 free(model->label);
299 free(model->nSV);
300 free(model);
301 return NULL;
302 }
303 } else
304
305 if (strcmp(key, "kernel_type") == 0) {
306 value = fields[1];
307 for (j = 0; kernel_type_table[j]; j++) {
308 if (strcmp(kernel_type_table[j], value) == 0) {
309 model->param.kernel_type = j;
310 break;
311 }
312 }
313 if (kernel_type_table[i] == NULL) {
314 vrna_message_warning("unknown kernel type.");
315 free(model->rho);
316 free(model->label);
317 free(model->nSV);
318 free(model);
319 return NULL;
320 }
321 } else
322
323 if (strcmp(key, "gamma") == 0) {
324 value = fields[1];
325 sscanf(value, "%lf", &model->param.gamma);
326 }
327
328 if (strcmp(key, "degree") == 0) {
329 value = fields[1];
330 sscanf(value, "%d", &model->param.degree);
331 } else
332
333 if (strcmp(key, "coef0") == 0) {
334 value = fields[1];
335 sscanf(value, "%lf", &model->param.coef0);
336 } else
337 if (strcmp(key, "nr_class") == 0) {
338 value = fields[1];
339 sscanf(value, "%d", &model->nr_class);
340 } else
341 if (strcmp(key, "total_sv") == 0) {
342 value = fields[1];
343 sscanf(value, "%d", &model->l);
344 } else
345
346 if (strcmp(key, "rho") == 0) {
347 int n = model->nr_class * (model->nr_class - 1) / 2;
348 model->rho = (double *)vrna_alloc(sizeof(double) * n);
349 for (j = 0; j < n; j++)
350 sscanf(fields[j + 1], "%lf", &model->rho[j]);
351 } else
352
353 if (strcmp(key, "nr_sv") == 0) {
354 int n = model->nr_class;
355 model->nSV = (int *)vrna_alloc(sizeof(int) * n);
356 for (j = 0; j < n; j++)
357 sscanf(fields[j + 1], "%d", &model->nSV[j]);
358 } else
359
360 if (strcmp(key, "label") == 0) {
361 int n = model->nr_class;
362 model->label = (int *)vrna_alloc(sizeof(int) * n);
363 for (j = 0; j < n; j++)
364 sscanf(fields[j + 1], "%d", &model->label[j]);
365 } else
366
367 if (strcmp(key, "probA") == 0) {
368 int n = model->nr_class * (model->nr_class - 1) / 2;
369 model->probA = (double *)vrna_alloc(sizeof(double) * n);
370 for (j = 0; j < n; j++)
371 sscanf(fields[j + 1], "%lf", &model->probA[j]);
372 } else
373
374 if (strcmp(key, "probB") == 0) {
375 int n = model->nr_class * (model->nr_class - 1) / 2;
376 model->probB = (double *)vrna_alloc(sizeof(double) * n);
377 for (j = 0; j < n; j++)
378 sscanf(fields[j + 1], "%lf", &model->probB[j]);
379 }
380
381 i++;
382 freeFields(fields);
383 }
384
385 dataStart = i + 1;
386 elements = 0;
387
388 /* Count number of nodes (by counting colons) in advance to allocate
389 * memory in one block */
390 while (lines[i] != NULL) {
391 j = 0;
392 while ((c = lines[i][j]) != '\0') {
393 if (c == ':')
394 elements++;
395
396 j++;
397 }
398 elements++;
399 i++;
400 }
401
402 /* allocate memory for SVs and coefficients */
403 m = model->nr_class - 1;
404 l = model->l;
405 model->sv_coef = (double **)vrna_alloc(sizeof(double *) * m);
406 for (i = 0; i < m; i++)
407 model->sv_coef[i] = (double *)vrna_alloc(sizeof(double) * l);
408 model->SV = (struct svm_node **)vrna_alloc(sizeof(struct svm_node *) * l);
409
410 if (l > 0)
411 x_space = (struct svm_node *)vrna_alloc(sizeof(struct svm_node) * (elements));
412
413 /* parse support vector data */
414 j = 0;
415 for (i = 0; i < l; i++) {
416 fields = splitFields(lines[dataStart + i]);
417 model->SV[i] = &x_space[j];
418 k = 0;
419 while ((field = fields[k]) != NULL) {
420 if (k < m) {
421 sscanf(fields[k], "%lf", &model->sv_coef[k][i]);
422 } else {
423 sscanf(fields[k], "%d:%lf", &(x_space[j].index), &(x_space[j].value));
424 j++;
425 }
426
427 k++;
428 }
429 x_space[j++].index = -1;
430 freeFields(fields);
431 }
432 freeFields(lines);
433
434 model->free_sv = 1;
435
436 return model;
437 }
438
439
440 PRIVATE char **
splitFields(char * string)441 splitFields(char *string)
442 {
443 char c;
444 char *currField;
445 char **output = NULL;
446 int *seps;
447 int nSep;
448 int nField = 0;
449 int i = 0;
450
451 if (strlen(string) == 0 || string == NULL)
452 return NULL;
453
454 /* First find all characters which are whitespaces and store the
455 * positions in the array seps */
456
457 seps = (int *)vrna_alloc(sizeof(int));
458 seps[0] = -1;
459 nSep = 1;
460
461 while ((c = string[i]) != '\0' && (c != '\n')) {
462 if (isspace(c)) {
463 seps = (int *)vrna_realloc(seps, sizeof(int) * (nSep + 1));
464 seps[nSep++] = i;
465 }
466
467 i++;
468 }
469
470 seps = (int *)vrna_realloc(seps, sizeof(int) * (nSep + 1));
471 seps[nSep] = strlen(string);
472
473
474 /* Then go through all intervals in between of two whitespaces (or
475 * end or start of string) and store the fields in the array
476 * "output"; if there are two adjacent whitespaces this is ignored
477 * resulting in a behaviour like "split /\s+/" in perl */
478
479 for (i = 0; i < nSep; i++) {
480 int start = seps[i];
481 int stop = seps[i + 1];
482 int length = (stop - start);
483 int notSpace, j;
484
485
486 currField = (char *)vrna_alloc(sizeof(char) * (length + 1));
487 strncpy(currField, string + start + 1, length - 1);
488 currField[length] = '\0';
489
490 /* check if field is not only whitespace */
491 notSpace = 0;
492 j = 0;
493 while ((c = currField[j]) != '\0') {
494 if (!isspace(c)) {
495 notSpace = 1;
496 break;
497 }
498 }
499
500 if (notSpace) {
501 output = (char **)vrna_realloc(output, sizeof(char **) * (nField + 1));
502 output[nField++] = currField;
503 currField = NULL;
504 } else {
505 free(currField);
506 currField = NULL;
507 }
508
509 /* printf("%s|\n",output[nField-1]); */
510 }
511
512 if (nField == 0)
513 return NULL;
514
515 output = (char **)vrna_realloc(output, sizeof(char **) * (nField + 1));
516 output[nField] = NULL;
517
518 free(seps);
519 return output;
520 }
521
522
523 PRIVATE char **
splitLines(char * string)524 splitLines(char *string)
525 {
526 char c;
527 char *currLine = NULL;
528 char **output = NULL;
529 int i = 0;
530 int currLength = 0;
531 int lineN = 0;
532
533 while ((c = string[i]) != '\0') {
534 if (c == '\n') {
535 output = (char **)vrna_realloc(output, sizeof(char **) * (lineN + 1));
536 currLine = (char *)vrna_realloc(currLine, sizeof(char) * (currLength + 1));
537 currLine[currLength] = '\0';
538 output[lineN] = currLine;
539 currLength = 0;
540 currLine = NULL;
541 lineN++;
542 } else {
543 currLine = (char *)vrna_realloc(currLine, sizeof(char) * (currLength + 1));
544 currLine[currLength] = c;
545 currLength++;
546 }
547
548 i++;
549 }
550
551 output = (char **)vrna_realloc(output, sizeof(char **) * (lineN + 1));
552 output[lineN] = NULL;
553
554 return output;
555 }
556
557
558 /* for both splitLines and splitFields */
559 void
freeFields(char ** fields)560 freeFields(char **fields)
561 {
562 int i = 0;
563
564 while (fields[i] != NULL)
565 free(fields[i++]);
566 free(fields);
567 }
568