1 /* SHAPE reactivity data handling */
2 
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
6 
7 #include <assert.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <math.h>
11 #include <ctype.h>
12 #include <string.h>
13 #include <limits.h>
14 
15 #include "ViennaRNA/params/default.h"
16 #include "ViennaRNA/params/constants.h" /* defines MINPSCORE */
17 #include "ViennaRNA/fold_vars.h"
18 #include "ViennaRNA/utils/basic.h"
19 #include "ViennaRNA/utils/strings.h"
20 #include "ViennaRNA/utils/alignments.h"
21 #include "ViennaRNA/io/utils.h"
22 #include "ViennaRNA/io/file_formats.h"
23 #include "ViennaRNA/params/basic.h"
24 #include "ViennaRNA/constraints/soft.h"
25 #include "ViennaRNA/constraints/SHAPE.h"
26 
27 /*
28  #################################
29  # GLOBAL VARIABLES              #
30  #################################
31  */
32 
33 /*
34  #################################
35  # PRIVATE VARIABLES             #
36  #################################
37  */
38 
39 /*
40  #################################
41  # PRIVATE FUNCTION DECLARATIONS #
42  #################################
43  */
44 PRIVATE void
45 sc_parse_parameters(const char  *string,
46                     char        c1,
47                     char        c2,
48                     float       *v1,
49                     float       *v2);
50 
51 
52 PRIVATE FLT_OR_DBL
53 conversion_deigan(double reactivity,
54                   double m,
55                   double b);
56 
57 
58 /*
59  #################################
60  # BEGIN OF FUNCTION DEFINITIONS #
61  #################################
62  */
63 PUBLIC void
vrna_constraints_add_SHAPE(vrna_fold_compound_t * vc,const char * shape_file,const char * shape_method,const char * shape_conversion,int verbose,unsigned int constraint_type)64 vrna_constraints_add_SHAPE(vrna_fold_compound_t *vc,
65                            const char           *shape_file,
66                            const char           *shape_method,
67                            const char           *shape_conversion,
68                            int                  verbose,
69                            unsigned int         constraint_type)
70 {
71   float   p1, p2;
72   char    method;
73   char    *sequence;
74   double  *values;
75   int     i, length = vc->length;
76 
77   if (!vrna_sc_SHAPE_parse_method(shape_method, &method, &p1, &p2)) {
78     vrna_message_warning("Method for SHAPE reactivity data conversion not recognized!");
79     return;
80   }
81 
82   if (verbose) {
83     if (method != 'W') {
84       if (method == 'Z') {
85         vrna_message_info(stderr, "Using SHAPE method '%c' with parameter p1=%f", method, p1);
86       } else {
87         vrna_message_info(stderr,
88                           "Using SHAPE method '%c' with parameters p1=%f and p2=%f",
89                           method,
90                           p1,
91                           p2);
92       }
93     }
94   }
95 
96   sequence  = vrna_alloc(sizeof(char) * (length + 1));
97   values    = vrna_alloc(sizeof(double) * (length + 1));
98   vrna_file_SHAPE_read(shape_file, length, method == 'W' ? 0 : -1, sequence, values);
99 
100   if (method == 'D') {
101     (void)vrna_sc_add_SHAPE_deigan(vc, (const double *)values, p1, p2, constraint_type);
102   } else if (method == 'Z') {
103     (void)vrna_sc_add_SHAPE_zarringhalam(vc,
104                                          (const double *)values,
105                                          p1,
106                                          0.5,
107                                          shape_conversion,
108                                          constraint_type);
109   } else {
110     assert(method == 'W');
111     FLT_OR_DBL *v = vrna_alloc(sizeof(FLT_OR_DBL) * (length + 1));
112     for (i = 0; i < length; i++)
113       v[i] = values[i];
114 
115     vrna_sc_set_up(vc, v, constraint_type);
116 
117     free(v);
118   }
119 
120   free(values);
121   free(sequence);
122 }
123 
124 
125 PUBLIC void
vrna_constraints_add_SHAPE_ali(vrna_fold_compound_t * vc,const char * shape_method,const char ** shape_files,const int * shape_file_association,int verbose,unsigned int constraint_type)126 vrna_constraints_add_SHAPE_ali(vrna_fold_compound_t *vc,
127                                const char           *shape_method,
128                                const char           **shape_files,
129                                const int            *shape_file_association,
130                                int                  verbose,
131                                unsigned int         constraint_type)
132 {
133   float p1, p2;
134   char  method;
135 
136   if (!vrna_sc_SHAPE_parse_method(shape_method, &method, &p1, &p2)) {
137     vrna_message_warning("Method for SHAPE reactivity data conversion not recognized!");
138     return;
139   }
140 
141   if (method != 'D') {
142     vrna_message_warning("SHAPE method %c not implemented for comparative prediction!",
143                          method);
144     vrna_message_warning("Ignoring SHAPE reactivity data!");
145     return;
146   } else {
147     if (verbose) {
148       vrna_message_info(stderr,
149                         "Using SHAPE method '%c' with parameters p1=%f and p2=%f",
150                         method,
151                         p1,
152                         p2);
153     }
154 
155     vrna_sc_add_SHAPE_deigan_ali(vc, shape_files, shape_file_association, p1, p2, constraint_type);
156     return;
157   }
158 }
159 
160 
161 PUBLIC int
vrna_sc_SHAPE_to_pr(const char * shape_conversion,double * values,int length,double default_value)162 vrna_sc_SHAPE_to_pr(const char  *shape_conversion,
163                     double      *values,
164                     int         length,
165                     double      default_value)
166 {
167   int *indices;
168   int i, j;
169   int index;
170   int ret = 1;
171 
172   if (!shape_conversion || !(*shape_conversion) || length <= 0)
173     return 0;
174 
175   if (*shape_conversion == 'S')
176     return 1;
177 
178   indices = vrna_alloc(sizeof(int) * (length + 1));
179   for (i = 1, j = 0; i <= length; ++i) {
180     if (values[i] < 0)
181       values[i] = default_value;
182     else
183       indices[j++] = i;
184   }
185 
186   if (*shape_conversion == 'M') {
187     double  max;
188     double  map_info[4][2] = { { 0.25, 0.35 },
189                                { 0.30, 0.55 },
190                                { 0.70, 0.85 },
191                                { 0,    1    } };
192 
193     max = values[1];
194     for (i = 2; i <= length; ++i)
195       max = MAX2(max, values[i]);
196     map_info[3][0] = max;
197 
198     for (i = 0; indices[i]; ++i) {
199       double  lower_source  = 0;
200       double  lower_target  = 0;
201 
202       index = indices[i];
203 
204       if (values[index] == 0)
205         continue;
206 
207       for (j = 0; j < 4; ++j) {
208         if (values[index] > lower_source && values[index] <= map_info[j][0]) {
209           double  diff_source = map_info[j][0] - lower_source;
210           double  diff_target = map_info[j][1] - lower_target;
211           values[index] = (values[index] - lower_source) / diff_source * diff_target + lower_target;
212           break;
213         }
214 
215         lower_source  = map_info[j][0];
216         lower_target  = map_info[j][1];
217       }
218     }
219   } else if (*shape_conversion == 'C') {
220     float cutoff = 0.25;
221     int   i;
222 
223     sscanf(shape_conversion + 1, "%f", &cutoff);
224 
225     for (i = 0; indices[i]; ++i) {
226       index         = indices[i];
227       values[index] = values[index] < cutoff ? 0 : 1;
228     }
229   } else if (*shape_conversion == 'L' || *shape_conversion == 'O') {
230     int   i;
231     float slope     = (*shape_conversion == 'L') ? 0.68 : 1.6;
232     float intercept = (*shape_conversion == 'L') ? 0.2 : -2.29;
233 
234     sc_parse_parameters(shape_conversion + 1, 's', 'i', &slope, &intercept);
235 
236     for (i = 0; indices[i]; ++i) {
237       double v;
238       index = indices[i];
239 
240       v             = (*shape_conversion == 'L') ? values[index] : log(values[index]);
241       values[index] = MAX2(MIN2((v - intercept) / slope, 1), 0);
242     }
243   } else {
244     ret = 0;
245   }
246 
247   free(indices);
248 
249   return ret;
250 }
251 
252 
253 PUBLIC int
vrna_sc_add_SHAPE_zarringhalam(vrna_fold_compound_t * vc,const double * reactivities,double b,double default_value,const char * shape_conversion,unsigned int options)254 vrna_sc_add_SHAPE_zarringhalam(vrna_fold_compound_t *vc,
255                                const double         *reactivities,
256                                double               b,
257                                double               default_value,
258                                const char           *shape_conversion,
259                                unsigned int         options)
260 {
261   int         i, j, n, ret;
262   double      *pr;
263   FLT_OR_DBL  *up, **bp;
264   vrna_md_t   *md;
265 
266   ret = 0; /* error */
267 
268   if (vc && reactivities && (vc->type == VRNA_FC_TYPE_SINGLE)) {
269     n   = vc->length;
270     md  = &(vc->params->model_details);
271 
272     /* first we copy over the reactivities to convert them into probabilities later on */
273     pr = (double *)vrna_alloc(sizeof(double) * (n + 1));
274     for (i = 0; i <= n; i++)
275       pr[i] = reactivities[i];
276 
277     if (vrna_sc_SHAPE_to_pr(shape_conversion, pr, n, default_value)) {
278       /*  now, convert them into pseudo free energies for unpaired, and
279        *  paired nucleotides
280        */
281       up  = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (n + 1));
282       bp  = (FLT_OR_DBL **)vrna_alloc(sizeof(FLT_OR_DBL *) * (n + 1));
283       for (i = 1; i <= n; ++i) {
284         up[i] = b * fabs(pr[i] - 1);
285         bp[i] = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (n + 1));
286         for (j = i + md->min_loop_size + 1; j <= n; ++j)
287           bp[i][j] = b * (pr[i] + pr[j]);
288       }
289 
290       /* add the pseudo energies as soft constraints */
291       vrna_sc_set_up(vc, (const FLT_OR_DBL *)up, options);
292       vrna_sc_set_bp(vc, (const FLT_OR_DBL **)bp, options);
293 
294       /* clean up memory */
295       for (i = 1; i <= n; ++i)
296         free(bp[i]);
297       free(bp);
298       free(up);
299 
300       ret = 1; /* success */
301     }
302 
303     free(pr);
304   }
305 
306   return ret;
307 }
308 
309 
310 PUBLIC int
vrna_sc_add_SHAPE_deigan(vrna_fold_compound_t * vc,const double * reactivities,double m,double b,unsigned int options)311 vrna_sc_add_SHAPE_deigan(vrna_fold_compound_t *vc,
312                          const double         *reactivities,
313                          double               m,
314                          double               b,
315                          unsigned int         options)
316 {
317   int         i;
318   FLT_OR_DBL  *values;
319 
320   if ((vc) &&
321       (reactivities)) {
322     switch (vc->type) {
323       case VRNA_FC_TYPE_SINGLE:
324         values = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (vc->length + 1));
325 
326         /* first convert the values according to provided slope and intercept values */
327         for (i = 1; i <= vc->length; ++i)
328           values[i] = conversion_deigan(reactivities[i], m, b);
329 
330         /* always store soft constraints in plain format */
331         vrna_sc_set_stack(vc, (const FLT_OR_DBL *)values, options);
332         free(values);
333 
334         return 1; /* success */
335 
336       case VRNA_FC_TYPE_COMPARATIVE:
337         vrna_message_warning("vrna_sc_add_SHAPE_deigan() not implemented for comparative prediction! "
338                              "Use vrna_sc_add_SHAPE_deigan_ali() instead!");
339         break;
340     }
341   }
342 
343   return 0; /* error */
344 }
345 
346 
347 PUBLIC int
vrna_sc_add_SHAPE_deigan_ali(vrna_fold_compound_t * vc,const char ** shape_files,const int * shape_file_association,double m,double b,unsigned int options)348 vrna_sc_add_SHAPE_deigan_ali(vrna_fold_compound_t *vc,
349                              const char           **shape_files,
350                              const int            *shape_file_association,
351                              double               m,
352                              double               b,
353                              unsigned int         options)
354 {
355   FILE          *fp;
356   float         reactivity, *reactivities, weight;
357   char          *line, nucleotide, *sequence;
358   int           s, i, r, n_data, position, n_seq, ret;
359   FLT_OR_DBL    **contributions, energy;
360   unsigned int  **a2s;
361 
362   ret = 0;
363 
364   if (vc && (vc->type == VRNA_FC_TYPE_COMPARATIVE)) {
365     n_seq = vc->n_seq;
366     a2s   = vc->a2s;
367 
368     vrna_sc_init(vc);
369 
370     /* count number of SHAPE data available for this alignment */
371     for (n_data = s = 0; shape_file_association[s] != -1; s++) {
372       if (shape_file_association[s] >= n_seq)
373         continue;
374 
375       /* try opening the shape data file */
376       if ((fp = fopen(shape_files[s], "r"))) {
377         fclose(fp);
378         n_data++;
379       }
380     }
381 
382     weight = (n_data > 0) ? ((float)n_seq / (float)n_data) : 0.;
383 
384     /* collect contributions for the sequences in the alignment */
385     contributions = (FLT_OR_DBL **)vrna_alloc(sizeof(FLT_OR_DBL *) * (n_seq));
386 
387     for (s = 0; shape_file_association[s] != -1; s++) {
388       int ss = shape_file_association[s]; /* actual sequence number in alignment */
389 
390       if (ss >= n_seq) {
391         vrna_message_warning("Failed to associate SHAPE file \"%s\" with sequence %d in alignment! "
392                              "Alignment has only %d sequences!",
393                              shape_files[s],
394                              ss,
395                              n_seq);
396         continue;
397       }
398 
399       /* read the shape file */
400       if (!(fp = fopen(shape_files[s], "r"))) {
401         vrna_message_warning("Failed to open SHAPE data file \"%d\"! "
402                              "No shape data will be used for sequence %d.",
403                              s,
404                              ss + 1);
405       } else {
406         reactivities  = (float *)vrna_alloc(sizeof(float) * (vc->length + 1));
407         sequence      = (char *)vrna_alloc(sizeof(char) * (vc->length + 1));
408 
409         /* initialize reactivities with missing data for entire alignment length */
410         for (i = 1; i <= vc->length; i++)
411           reactivities[i] = -1.;
412 
413         while ((line = vrna_read_line(fp))) {
414           r = sscanf(line, "%d %c %f", &position, &nucleotide, &reactivity);
415           if (r) {
416             if (position <= 0) {
417               vrna_message_warning("SHAPE data for position %d outside alignment!", position);
418             } else if (position > vc->length) {
419               vrna_message_warning("SHAPE data for position %d outside alignment!", position);
420             } else {
421               switch (r) {
422                 case 1:
423                   nucleotide = 'N';
424                 /* fall through */
425                 case 2:
426                   reactivity = -1.;
427                 /* fall through */
428                 default:
429                   sequence[position - 1]  = nucleotide;
430                   reactivities[position]  = reactivity;
431                   break;
432               }
433             }
434           }
435 
436           free(line);
437         }
438         fclose(fp);
439 
440         sequence[vc->length] = '\0';
441 
442         /* double check information by comparing the sequence read from */
443         char *tmp_seq = vrna_seq_ungapped(vc->sequences[shape_file_association[s]]);
444         if (strcmp(tmp_seq, sequence))
445           vrna_message_warning("Input sequence %d differs from sequence provided via SHAPE file!",
446                                shape_file_association[s] + 1);
447 
448         free(tmp_seq);
449 
450         /*  begin preparation of the pseudo energies */
451         /*  beware of the fact that energy_stack will be accessed through a2s[s] array,
452          *  hence pseudo_energy might be gap-free (default)
453          */
454         int gaps, is_gap;
455         contributions[ss] = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (vc->length + 1));
456         for (gaps = 0, i = 1; i <= vc->length; i++) {
457           is_gap  = (vc->sequences[ss][i - 1] == '-') ? 1 : 0;
458           energy  = ((i - gaps > 0) && !(is_gap)) ? conversion_deigan(reactivities[i - gaps], m, b) * weight : 0.;
459 
460           if (vc->params->model_details.oldAliEn) {
461             contributions[ss][i] = energy;
462           } else if (!is_gap) {
463             contributions[ss][a2s[ss][i]] = energy;
464           }
465 
466           gaps += is_gap;
467         }
468 
469         free(reactivities);
470       }
471 
472 
473     }
474 
475     ret = vrna_sc_set_stack_comparative(vc, (const FLT_OR_DBL **)contributions, options);
476 
477     for (s = 0; s < n_seq; s++)
478       free(contributions[s]);
479 
480     free(contributions);
481 
482   }
483 
484   return ret;
485 }
486 
487 
488 PUBLIC int
vrna_sc_SHAPE_parse_method(const char * method_string,char * method,float * param_1,float * param_2)489 vrna_sc_SHAPE_parse_method(const char *method_string,
490                            char       *method,
491                            float      *param_1,
492                            float      *param_2)
493 {
494   const char *params = method_string + 1;
495 
496   *param_1  = 0;
497   *param_2  = 0;
498 
499   if (!method_string || !method_string[0])
500     return 0;
501 
502   *method = method_string[0];
503 
504   switch (method_string[0]) {
505     case 'Z':
506       *param_1 = 0.89;
507       sc_parse_parameters(params, 'b', '\0', param_1, NULL);
508       break;
509 
510     case 'D':
511       *param_1  = 1.8;
512       *param_2  = -0.6;
513       sc_parse_parameters(params, 'm', 'b', param_1, param_2);
514       break;
515 
516     case 'W':
517       break;
518 
519     default:
520       *method = 0;
521       return 0;
522   }
523 
524   return 1;
525 }
526 
527 
528 PRIVATE void
sc_parse_parameters(const char * string,char c1,char c2,float * v1,float * v2)529 sc_parse_parameters(const char  *string,
530                     char        c1,
531                     char        c2,
532                     float       *v1,
533                     float       *v2)
534 {
535   char        *fmt;
536   const char  warning[] = "SHAPE method parameters not recognized! Using default parameters!";
537   int         r;
538 
539   assert(c1);
540   assert(v1);
541 
542   if (!string || !(*string))
543     return;
544 
545   if (c2 == 0 || v2 == NULL) {
546     fmt = vrna_strdup_printf("%c%%f", c1);
547     r   = sscanf(string, fmt, v1);
548 
549     if (!r)
550       vrna_message_warning(warning);
551 
552     free(fmt);
553 
554     return;
555   }
556 
557   fmt = vrna_strdup_printf("%c%%f%c%%f", c1, c2);
558   r   = sscanf(string, fmt, v1, v2);
559 
560   if (r != 2) {
561     free(fmt);
562     fmt = vrna_strdup_printf("%c%%f", c1);
563     r   = sscanf(string, fmt, v1);
564 
565     if (!r) {
566       free(fmt);
567       fmt = vrna_strdup_printf("%c%%f", c2);
568       r   = sscanf(string, fmt, v2);
569 
570       if (!r)
571         vrna_message_warning(warning);
572     }
573   }
574 
575   free(fmt);
576 }
577 
578 
579 PRIVATE FLT_OR_DBL
conversion_deigan(double reactivity,double m,double b)580 conversion_deigan(double reactivity,
581                   double m,
582                   double b)
583 {
584   return reactivity < 0 ? 0. : (FLT_OR_DBL)(m * log(reactivity + 1) + b);
585 }
586 
587