1 #ifdef HAVE_CONFIG_H
2 #include "config.h"
3 #endif
4 
5 #include <stdio.h>
6 #include <stdlib.h>
7 #include <math.h>
8 #include <ctype.h>
9 #include <string.h>
10 
11 #include <svm.h>
12 
13 #include "ViennaRNA/utils/basic.h"
14 #include "ViennaRNA/fold_vars.h"
15 #include "ViennaRNA/pair_mat.h"
16 #include "ViennaRNA/utils/svm.h"
17 
18 
19 #include "ViennaRNA/params/svm_model_avg.inc"  /* defines avg_model_string */
20 #include "ViennaRNA/params/svm_model_sd.inc"   /* defines sd_model_string */
21 
22 
23 PRIVATE struct svm_model  *avg_model;
24 PRIVATE struct svm_model  *sd_model;
25 
26 PRIVATE void
27 freeFields(char **fields);
28 
29 
30 PRIVATE char **
31 splitFields(char *string);
32 
33 
34 PRIVATE char **
35 splitLines(char *string);
36 
37 
38 PUBLIC float
get_z(char * sequence,double energy)39 get_z(char    *sequence,
40       double  energy)
41 {
42   double        average_free_energy;
43   double        sd_free_energy;
44   float         my_z = 0.;
45   int           info_avg;
46 
47   make_pair_matrix();
48   short         *S      = encode_sequence(sequence, 0);
49   unsigned int  length  = strlen(sequence);
50   int           *AUGC   = get_seq_composition(S, 1, length, length);
51   avg_model           = svm_load_model_string(avg_model_string);
52   sd_model            = svm_load_model_string(sd_model_string);
53   average_free_energy = avg_regression(AUGC[0],
54                                        AUGC[1],
55                                        AUGC[2],
56                                        AUGC[3],
57                                        AUGC[4],
58                                        avg_model,
59                                        &info_avg);
60 
61   if (info_avg == 0) {
62     double difference = (energy /* /100*/) - average_free_energy;
63     sd_free_energy  = sd_regression(AUGC[0], AUGC[1], AUGC[2], AUGC[3], AUGC[4], sd_model);
64     my_z            = difference / sd_free_energy;
65   } else {
66     vrna_message_warning("sequence out of bounds");
67 #if 0
68     my_z = shuffle_score(sequence, energy);
69 #endif
70   }
71 
72   free(AUGC);
73   free(S);
74   svm_free_model_content(avg_model);
75   svm_free_model_content(sd_model);
76   return my_z;
77 }
78 
79 
80 PUBLIC int *
get_seq_composition(short * S,unsigned int start,unsigned int stop,unsigned int length)81 get_seq_composition(short         *S,
82                     unsigned int  start,
83                     unsigned int  stop,
84                     unsigned int  length)
85 {
86   unsigned int  i;
87   int           *ret = (int *)vrna_alloc(sizeof(int) * 6);
88 
89   for (i = MAX2(start, 1); i <= MIN2(stop, length); i++) {
90     if (S[i] > 4)
91       ret[0]++;
92     else
93       ret[S[i]]++;
94   }
95   ret[5] = -1; /* indicate last entry */
96   return ret;
97 }
98 
99 
100 PUBLIC double
sd_regression(int N,int A,int C,int G,int T,struct svm_model * sd_model)101 sd_regression(int               N,
102               int               A,
103               int               C,
104               int               G,
105               int               T,
106               struct svm_model  *sd_model)
107 {
108   double          sd_free_energy  = 0.0;
109   int             length          = A + C + G + T + N;
110   double          GC_content      = (double)(G + C) / length;
111   double          AT_ratio        = (double)A / (A + T);
112   double          CG_ratio        = (double)C / (C + G);
113   double          norm_length     = (double)(length - 50) / 350.0;
114   struct svm_node node_mono[5];
115 
116   node_mono[0].index  = 1;
117   node_mono[0].value  = GC_content;
118   node_mono[1].index  = 2;
119   node_mono[1].value  = AT_ratio;
120   node_mono[2].index  = 3;
121   node_mono[2].value  = CG_ratio;
122   node_mono[3].index  = 4;
123   node_mono[3].value  = norm_length;
124   node_mono[4].index  = -1;
125 
126   sd_free_energy = svm_predict(sd_model, node_mono);
127 
128   sd_free_energy = (double)sd_free_energy * sqrt(length);
129 
130   return sd_free_energy;
131 }
132 
133 
134 PUBLIC double
avg_regression(int N,int A,int C,int G,int T,struct svm_model * avg_model,int * info)135 avg_regression(int              N,
136                int              A,
137                int              C,
138                int              G,
139                int              T,
140                struct svm_model *avg_model,
141                int              *info)
142 {
143   double          average_free_energy = 0.0;
144 
145   int             length      = A + C + G + T + N;
146   double          N_fraction  = (double)N / length;
147   double          GC_content  = (double)(G + C) / length;
148   double          AT_ratio    = (double)A / (A + T);
149   double          CG_ratio    = (double)C / (C + G);
150 
151   double          norm_length = (double)(length - 50) / 350.0;
152 
153   struct svm_node node_mono[5];
154 
155   *info = 0;
156   if (length < 50 || length > 400) {
157     *info = 1;
158     return 0.0;
159   }
160 
161   if (N_fraction > 0.05) {
162     *info = 2;
163     return 0.0;
164   }
165 
166   if (GC_content < 0.20 || GC_content > 0.80) {
167     *info = 3;
168     return 0.0;
169   }
170 
171   if (AT_ratio < 0.20 || AT_ratio > 0.80) {
172     *info = 4;
173     return 0.0;
174   }
175 
176   if (CG_ratio < 0.20 || CG_ratio > 0.80) {
177     *info = 5;
178     return 0.0;
179   }
180 
181   node_mono[0].index  = 1;
182   node_mono[0].value  = GC_content;
183   node_mono[1].index  = 2;
184   node_mono[1].value  = AT_ratio;
185   node_mono[2].index  = 3;
186   node_mono[2].value  = CG_ratio;
187   node_mono[3].index  = 4;
188   node_mono[3].value  = norm_length;
189   node_mono[4].index  = -1;
190 
191   average_free_energy = svm_predict(avg_model, node_mono);
192 
193   average_free_energy = (double)average_free_energy * length;
194 
195   return average_free_energy;
196 }
197 
198 
199 PUBLIC double
minimal_sd(int N,int A,int C,int G,int T)200 minimal_sd(int  N,
201            int  A,
202            int  C,
203            int  G,
204            int  T)
205 {
206   int length = A + C + G + T + N;
207 
208   if (length < 60)
209     return 0.450324;
210 
211   if (length < 70)
212     return 0.749771;
213 
214   if (length < 80)
215     return 1.029421;
216 
217   if (length < 90)
218     return 1.027517;
219 
220   if (length < 100)
221     return 1.347283;
222 
223   if (length < 120)
224     return 1.112086;
225 
226   if (length < 150)
227     return 1.574339;
228 
229   if (length < 170)
230     return 1.779043;
231 
232   if (length < 200)
233     return 1.922908;
234 
235   if (length < 250)
236     return 2.226856;
237 
238   if (length < 300)
239     return 2.349300;
240 
241   if (length < 350)
242     return 2.589703;
243 
244   if (length < 400)
245     return 2.791215;
246 
247   return 0.450324;
248 }
249 
250 
251 PUBLIC struct svm_model *
svm_load_model_string(char * modelString)252 svm_load_model_string(char *modelString)
253 {
254   /* redefinition from svm.cpp */
255   char              *svm_type_table[] = {
256     "c_svc", "nu_svc", "one_class", "epsilon_svr", "nu_svr", NULL
257   };
258   char              *kernel_type_table[] = {
259     "linear", "polynomial", "rbf", "sigmoid", NULL
260   };
261 
262   struct svm_model  *model;
263   char              **lines, **fields;
264   int               i, j, k, l, m;
265   char              *key, *value, *field;
266   char              c;
267   int               dataStart, elements;
268   struct svm_node   *x_space = NULL;
269 
270   model = (struct svm_model *)vrna_alloc(sizeof(struct svm_model));
271 
272   model->rho    = NULL;
273   model->probA  = NULL;
274   model->probB  = NULL;
275   model->label  = NULL;
276   model->nSV    = NULL;
277 
278 
279   /* Read header until support vectors start */
280   lines = splitLines(modelString);
281   i     = 0;
282   while (lines[i] && (strcmp(lines[i], "SV") != 0)) {
283     fields = splitFields(lines[i]);
284 
285     key = fields[0];
286 
287     if (strcmp(key, "svm_type") == 0) {
288       value = fields[1];
289       for (j = 0; svm_type_table[j]; j++) {
290         if (strcmp(svm_type_table[j], value) == 0) {
291           model->param.svm_type = j;
292           break;
293         }
294       }
295       if (svm_type_table[i] == NULL) {
296         vrna_message_warning("unknown svm type.");
297         free(model->rho);
298         free(model->label);
299         free(model->nSV);
300         free(model);
301         return NULL;
302       }
303     } else
304 
305     if (strcmp(key, "kernel_type") == 0) {
306       value = fields[1];
307       for (j = 0; kernel_type_table[j]; j++) {
308         if (strcmp(kernel_type_table[j], value) == 0) {
309           model->param.kernel_type = j;
310           break;
311         }
312       }
313       if (kernel_type_table[i] == NULL) {
314         vrna_message_warning("unknown kernel type.");
315         free(model->rho);
316         free(model->label);
317         free(model->nSV);
318         free(model);
319         return NULL;
320       }
321     } else
322 
323     if (strcmp(key, "gamma") == 0) {
324       value = fields[1];
325       sscanf(value, "%lf", &model->param.gamma);
326     }
327 
328     if (strcmp(key, "degree") == 0) {
329       value = fields[1];
330       sscanf(value, "%d", &model->param.degree);
331     } else
332 
333     if (strcmp(key, "coef0") == 0) {
334       value = fields[1];
335       sscanf(value, "%lf", &model->param.coef0);
336     } else
337     if (strcmp(key, "nr_class") == 0) {
338       value = fields[1];
339       sscanf(value, "%d", &model->nr_class);
340     } else
341     if (strcmp(key, "total_sv") == 0) {
342       value = fields[1];
343       sscanf(value, "%d", &model->l);
344     } else
345 
346     if (strcmp(key, "rho") == 0) {
347       int n = model->nr_class * (model->nr_class - 1) / 2;
348       model->rho = (double *)vrna_alloc(sizeof(double) * n);
349       for (j = 0; j < n; j++)
350         sscanf(fields[j + 1], "%lf", &model->rho[j]);
351     } else
352 
353     if (strcmp(key, "nr_sv") == 0) {
354       int n = model->nr_class;
355       model->nSV = (int *)vrna_alloc(sizeof(int) * n);
356       for (j = 0; j < n; j++)
357         sscanf(fields[j + 1], "%d", &model->nSV[j]);
358     } else
359 
360     if (strcmp(key, "label") == 0) {
361       int n = model->nr_class;
362       model->label = (int *)vrna_alloc(sizeof(int) * n);
363       for (j = 0; j < n; j++)
364         sscanf(fields[j + 1], "%d", &model->label[j]);
365     } else
366 
367     if (strcmp(key, "probA") == 0) {
368       int n = model->nr_class * (model->nr_class - 1) / 2;
369       model->probA = (double *)vrna_alloc(sizeof(double) * n);
370       for (j = 0; j < n; j++)
371         sscanf(fields[j + 1], "%lf", &model->probA[j]);
372     } else
373 
374     if (strcmp(key, "probB") == 0) {
375       int n = model->nr_class * (model->nr_class - 1) / 2;
376       model->probB = (double *)vrna_alloc(sizeof(double) * n);
377       for (j = 0; j < n; j++)
378         sscanf(fields[j + 1], "%lf", &model->probB[j]);
379     }
380 
381     i++;
382     freeFields(fields);
383   }
384 
385   dataStart = i + 1;
386   elements  = 0;
387 
388   /* Count number of nodes (by counting colons) in advance to allocate
389    *     memory in one block */
390   while (lines[i] != NULL) {
391     j = 0;
392     while ((c = lines[i][j]) != '\0') {
393       if (c == ':')
394         elements++;
395 
396       j++;
397     }
398     elements++;
399     i++;
400   }
401 
402   /* allocate memory for SVs and coefficients */
403   m               = model->nr_class - 1;
404   l               = model->l;
405   model->sv_coef  = (double **)vrna_alloc(sizeof(double *) * m);
406   for (i = 0; i < m; i++)
407     model->sv_coef[i] = (double *)vrna_alloc(sizeof(double) * l);
408   model->SV = (struct svm_node **)vrna_alloc(sizeof(struct svm_node *) * l);
409 
410   if (l > 0)
411     x_space = (struct svm_node *)vrna_alloc(sizeof(struct svm_node) * (elements));
412 
413   /* parse support vector data */
414   j = 0;
415   for (i = 0; i < l; i++) {
416     fields        = splitFields(lines[dataStart + i]);
417     model->SV[i]  = &x_space[j];
418     k             = 0;
419     while ((field = fields[k]) != NULL) {
420       if (k < m) {
421         sscanf(fields[k], "%lf", &model->sv_coef[k][i]);
422       } else {
423         sscanf(fields[k], "%d:%lf", &(x_space[j].index), &(x_space[j].value));
424         j++;
425       }
426 
427       k++;
428     }
429     x_space[j++].index = -1;
430     freeFields(fields);
431   }
432   freeFields(lines);
433 
434   model->free_sv = 1;
435 
436   return model;
437 }
438 
439 
440 PRIVATE char **
splitFields(char * string)441 splitFields(char *string)
442 {
443   char  c;
444   char  *currField;
445   char  **output = NULL;
446   int   *seps;
447   int   nSep;
448   int   nField  = 0;
449   int   i       = 0;
450 
451   if (strlen(string) == 0 || string == NULL)
452     return NULL;
453 
454   /* First find all characters which are whitespaces and store the
455    *     positions in the array seps */
456 
457   seps    = (int *)vrna_alloc(sizeof(int));
458   seps[0] = -1;
459   nSep    = 1;
460 
461   while ((c = string[i]) != '\0' && (c != '\n')) {
462     if (isspace(c)) {
463       seps          = (int *)vrna_realloc(seps, sizeof(int) * (nSep + 1));
464       seps[nSep++]  = i;
465     }
466 
467     i++;
468   }
469 
470   seps        = (int *)vrna_realloc(seps, sizeof(int) * (nSep + 1));
471   seps[nSep]  = strlen(string);
472 
473 
474   /* Then go through all intervals in between of two whitespaces (or
475    *     end or start of string) and store the fields in the array
476    *     "output"; if there are two adjacent whitespaces this is ignored
477    *     resulting in a behaviour like "split /\s+/" in perl */
478 
479   for (i = 0; i < nSep; i++) {
480     int start = seps[i];
481     int stop = seps[i + 1];
482     int length = (stop - start);
483     int notSpace, j;
484 
485 
486     currField = (char *)vrna_alloc(sizeof(char) * (length + 1));
487     strncpy(currField, string + start + 1, length - 1);
488     currField[length] = '\0';
489 
490     /* check if field is not only whitespace */
491     notSpace  = 0;
492     j         = 0;
493     while ((c = currField[j]) != '\0') {
494       if (!isspace(c)) {
495         notSpace = 1;
496         break;
497       }
498     }
499 
500     if (notSpace) {
501       output            = (char **)vrna_realloc(output, sizeof(char **) * (nField + 1));
502       output[nField++]  = currField;
503       currField         = NULL;
504     } else {
505       free(currField);
506       currField = NULL;
507     }
508 
509     /* printf("%s|\n",output[nField-1]); */
510   }
511 
512   if (nField == 0)
513     return NULL;
514 
515   output          = (char **)vrna_realloc(output, sizeof(char **) * (nField + 1));
516   output[nField]  = NULL;
517 
518   free(seps);
519   return output;
520 }
521 
522 
523 PRIVATE char **
splitLines(char * string)524 splitLines(char *string)
525 {
526   char  c;
527   char  *currLine   = NULL;
528   char  **output    = NULL;
529   int   i           = 0;
530   int   currLength  = 0;
531   int   lineN       = 0;
532 
533   while ((c = string[i]) != '\0') {
534     if (c == '\n') {
535       output                = (char **)vrna_realloc(output, sizeof(char **) * (lineN + 1));
536       currLine              = (char *)vrna_realloc(currLine, sizeof(char) * (currLength + 1));
537       currLine[currLength]  = '\0';
538       output[lineN]         = currLine;
539       currLength            = 0;
540       currLine              = NULL;
541       lineN++;
542     } else {
543       currLine              = (char *)vrna_realloc(currLine, sizeof(char) * (currLength + 1));
544       currLine[currLength]  = c;
545       currLength++;
546     }
547 
548     i++;
549   }
550 
551   output        = (char **)vrna_realloc(output, sizeof(char **) * (lineN + 1));
552   output[lineN] = NULL;
553 
554   return output;
555 }
556 
557 
558 /*  for both splitLines and splitFields */
559 void
freeFields(char ** fields)560 freeFields(char **fields)
561 {
562   int i = 0;
563 
564   while (fields[i] != NULL)
565     free(fields[i++]);
566   free(fields);
567 }
568