1 /* SHAPE reactivity data handling */
2
3 #ifdef HAVE_CONFIG_H
4 #include "config.h"
5 #endif
6
7 #include <assert.h>
8 #include <stdio.h>
9 #include <stdlib.h>
10 #include <math.h>
11 #include <ctype.h>
12 #include <string.h>
13 #include <limits.h>
14
15 #include "ViennaRNA/params/default.h"
16 #include "ViennaRNA/params/constants.h" /* defines MINPSCORE */
17 #include "ViennaRNA/fold_vars.h"
18 #include "ViennaRNA/utils/basic.h"
19 #include "ViennaRNA/utils/strings.h"
20 #include "ViennaRNA/utils/alignments.h"
21 #include "ViennaRNA/io/utils.h"
22 #include "ViennaRNA/io/file_formats.h"
23 #include "ViennaRNA/params/basic.h"
24 #include "ViennaRNA/constraints/soft.h"
25 #include "ViennaRNA/constraints/SHAPE.h"
26
27 /*
28 #################################
29 # GLOBAL VARIABLES #
30 #################################
31 */
32
33 /*
34 #################################
35 # PRIVATE VARIABLES #
36 #################################
37 */
38
39 /*
40 #################################
41 # PRIVATE FUNCTION DECLARATIONS #
42 #################################
43 */
44 PRIVATE void
45 sc_parse_parameters(const char *string,
46 char c1,
47 char c2,
48 float *v1,
49 float *v2);
50
51
52 PRIVATE FLT_OR_DBL
53 conversion_deigan(double reactivity,
54 double m,
55 double b);
56
57
58 /*
59 #################################
60 # BEGIN OF FUNCTION DEFINITIONS #
61 #################################
62 */
63 PUBLIC void
vrna_constraints_add_SHAPE(vrna_fold_compound_t * vc,const char * shape_file,const char * shape_method,const char * shape_conversion,int verbose,unsigned int constraint_type)64 vrna_constraints_add_SHAPE(vrna_fold_compound_t *vc,
65 const char *shape_file,
66 const char *shape_method,
67 const char *shape_conversion,
68 int verbose,
69 unsigned int constraint_type)
70 {
71 float p1, p2;
72 char method;
73 char *sequence;
74 double *values;
75 int i, length = vc->length;
76
77 if (!vrna_sc_SHAPE_parse_method(shape_method, &method, &p1, &p2)) {
78 vrna_message_warning("Method for SHAPE reactivity data conversion not recognized!");
79 return;
80 }
81
82 if (verbose) {
83 if (method != 'W') {
84 if (method == 'Z') {
85 vrna_message_info(stderr, "Using SHAPE method '%c' with parameter p1=%f", method, p1);
86 } else {
87 vrna_message_info(stderr,
88 "Using SHAPE method '%c' with parameters p1=%f and p2=%f",
89 method,
90 p1,
91 p2);
92 }
93 }
94 }
95
96 sequence = vrna_alloc(sizeof(char) * (length + 1));
97 values = vrna_alloc(sizeof(double) * (length + 1));
98 vrna_file_SHAPE_read(shape_file, length, method == 'W' ? 0 : -1, sequence, values);
99
100 if (method == 'D') {
101 (void)vrna_sc_add_SHAPE_deigan(vc, (const double *)values, p1, p2, constraint_type);
102 } else if (method == 'Z') {
103 (void)vrna_sc_add_SHAPE_zarringhalam(vc,
104 (const double *)values,
105 p1,
106 0.5,
107 shape_conversion,
108 constraint_type);
109 } else {
110 assert(method == 'W');
111 FLT_OR_DBL *v = vrna_alloc(sizeof(FLT_OR_DBL) * (length + 1));
112 for (i = 0; i < length; i++)
113 v[i] = values[i];
114
115 vrna_sc_set_up(vc, v, constraint_type);
116
117 free(v);
118 }
119
120 free(values);
121 free(sequence);
122 }
123
124
125 PUBLIC void
vrna_constraints_add_SHAPE_ali(vrna_fold_compound_t * vc,const char * shape_method,const char ** shape_files,const int * shape_file_association,int verbose,unsigned int constraint_type)126 vrna_constraints_add_SHAPE_ali(vrna_fold_compound_t *vc,
127 const char *shape_method,
128 const char **shape_files,
129 const int *shape_file_association,
130 int verbose,
131 unsigned int constraint_type)
132 {
133 float p1, p2;
134 char method;
135
136 if (!vrna_sc_SHAPE_parse_method(shape_method, &method, &p1, &p2)) {
137 vrna_message_warning("Method for SHAPE reactivity data conversion not recognized!");
138 return;
139 }
140
141 if (method != 'D') {
142 vrna_message_warning("SHAPE method %c not implemented for comparative prediction!",
143 method);
144 vrna_message_warning("Ignoring SHAPE reactivity data!");
145 return;
146 } else {
147 if (verbose) {
148 vrna_message_info(stderr,
149 "Using SHAPE method '%c' with parameters p1=%f and p2=%f",
150 method,
151 p1,
152 p2);
153 }
154
155 vrna_sc_add_SHAPE_deigan_ali(vc, shape_files, shape_file_association, p1, p2, constraint_type);
156 return;
157 }
158 }
159
160
161 PUBLIC int
vrna_sc_SHAPE_to_pr(const char * shape_conversion,double * values,int length,double default_value)162 vrna_sc_SHAPE_to_pr(const char *shape_conversion,
163 double *values,
164 int length,
165 double default_value)
166 {
167 int *indices;
168 int i, j;
169 int index;
170 int ret = 1;
171
172 if (!shape_conversion || !(*shape_conversion) || length <= 0)
173 return 0;
174
175 if (*shape_conversion == 'S')
176 return 1;
177
178 indices = vrna_alloc(sizeof(int) * (length + 1));
179 for (i = 1, j = 0; i <= length; ++i) {
180 if (values[i] < 0)
181 values[i] = default_value;
182 else
183 indices[j++] = i;
184 }
185
186 if (*shape_conversion == 'M') {
187 double max;
188 double map_info[4][2] = { { 0.25, 0.35 },
189 { 0.30, 0.55 },
190 { 0.70, 0.85 },
191 { 0, 1 } };
192
193 max = values[1];
194 for (i = 2; i <= length; ++i)
195 max = MAX2(max, values[i]);
196 map_info[3][0] = max;
197
198 for (i = 0; indices[i]; ++i) {
199 double lower_source = 0;
200 double lower_target = 0;
201
202 index = indices[i];
203
204 if (values[index] == 0)
205 continue;
206
207 for (j = 0; j < 4; ++j) {
208 if (values[index] > lower_source && values[index] <= map_info[j][0]) {
209 double diff_source = map_info[j][0] - lower_source;
210 double diff_target = map_info[j][1] - lower_target;
211 values[index] = (values[index] - lower_source) / diff_source * diff_target + lower_target;
212 break;
213 }
214
215 lower_source = map_info[j][0];
216 lower_target = map_info[j][1];
217 }
218 }
219 } else if (*shape_conversion == 'C') {
220 float cutoff = 0.25;
221 int i;
222
223 sscanf(shape_conversion + 1, "%f", &cutoff);
224
225 for (i = 0; indices[i]; ++i) {
226 index = indices[i];
227 values[index] = values[index] < cutoff ? 0 : 1;
228 }
229 } else if (*shape_conversion == 'L' || *shape_conversion == 'O') {
230 int i;
231 float slope = (*shape_conversion == 'L') ? 0.68 : 1.6;
232 float intercept = (*shape_conversion == 'L') ? 0.2 : -2.29;
233
234 sc_parse_parameters(shape_conversion + 1, 's', 'i', &slope, &intercept);
235
236 for (i = 0; indices[i]; ++i) {
237 double v;
238 index = indices[i];
239
240 v = (*shape_conversion == 'L') ? values[index] : log(values[index]);
241 values[index] = MAX2(MIN2((v - intercept) / slope, 1), 0);
242 }
243 } else {
244 ret = 0;
245 }
246
247 free(indices);
248
249 return ret;
250 }
251
252
253 PUBLIC int
vrna_sc_add_SHAPE_zarringhalam(vrna_fold_compound_t * vc,const double * reactivities,double b,double default_value,const char * shape_conversion,unsigned int options)254 vrna_sc_add_SHAPE_zarringhalam(vrna_fold_compound_t *vc,
255 const double *reactivities,
256 double b,
257 double default_value,
258 const char *shape_conversion,
259 unsigned int options)
260 {
261 int i, j, n, ret;
262 double *pr;
263 FLT_OR_DBL *up, **bp;
264 vrna_md_t *md;
265
266 ret = 0; /* error */
267
268 if (vc && reactivities && (vc->type == VRNA_FC_TYPE_SINGLE)) {
269 n = vc->length;
270 md = &(vc->params->model_details);
271
272 /* first we copy over the reactivities to convert them into probabilities later on */
273 pr = (double *)vrna_alloc(sizeof(double) * (n + 1));
274 for (i = 0; i <= n; i++)
275 pr[i] = reactivities[i];
276
277 if (vrna_sc_SHAPE_to_pr(shape_conversion, pr, n, default_value)) {
278 /* now, convert them into pseudo free energies for unpaired, and
279 * paired nucleotides
280 */
281 up = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (n + 1));
282 bp = (FLT_OR_DBL **)vrna_alloc(sizeof(FLT_OR_DBL *) * (n + 1));
283 for (i = 1; i <= n; ++i) {
284 up[i] = b * fabs(pr[i] - 1);
285 bp[i] = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (n + 1));
286 for (j = i + md->min_loop_size + 1; j <= n; ++j)
287 bp[i][j] = b * (pr[i] + pr[j]);
288 }
289
290 /* add the pseudo energies as soft constraints */
291 vrna_sc_set_up(vc, (const FLT_OR_DBL *)up, options);
292 vrna_sc_set_bp(vc, (const FLT_OR_DBL **)bp, options);
293
294 /* clean up memory */
295 for (i = 1; i <= n; ++i)
296 free(bp[i]);
297 free(bp);
298 free(up);
299
300 ret = 1; /* success */
301 }
302
303 free(pr);
304 }
305
306 return ret;
307 }
308
309
310 PUBLIC int
vrna_sc_add_SHAPE_deigan(vrna_fold_compound_t * vc,const double * reactivities,double m,double b,unsigned int options)311 vrna_sc_add_SHAPE_deigan(vrna_fold_compound_t *vc,
312 const double *reactivities,
313 double m,
314 double b,
315 unsigned int options)
316 {
317 int i;
318 FLT_OR_DBL *values;
319
320 if ((vc) &&
321 (reactivities)) {
322 switch (vc->type) {
323 case VRNA_FC_TYPE_SINGLE:
324 values = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (vc->length + 1));
325
326 /* first convert the values according to provided slope and intercept values */
327 for (i = 1; i <= vc->length; ++i)
328 values[i] = conversion_deigan(reactivities[i], m, b);
329
330 /* always store soft constraints in plain format */
331 vrna_sc_set_stack(vc, (const FLT_OR_DBL *)values, options);
332 free(values);
333
334 return 1; /* success */
335
336 case VRNA_FC_TYPE_COMPARATIVE:
337 vrna_message_warning("vrna_sc_add_SHAPE_deigan() not implemented for comparative prediction! "
338 "Use vrna_sc_add_SHAPE_deigan_ali() instead!");
339 break;
340 }
341 }
342
343 return 0; /* error */
344 }
345
346
347 PUBLIC int
vrna_sc_add_SHAPE_deigan_ali(vrna_fold_compound_t * vc,const char ** shape_files,const int * shape_file_association,double m,double b,unsigned int options)348 vrna_sc_add_SHAPE_deigan_ali(vrna_fold_compound_t *vc,
349 const char **shape_files,
350 const int *shape_file_association,
351 double m,
352 double b,
353 unsigned int options)
354 {
355 FILE *fp;
356 float reactivity, *reactivities, weight;
357 char *line, nucleotide, *sequence;
358 int s, i, r, n_data, position, n_seq, ret;
359 FLT_OR_DBL **contributions, energy;
360 unsigned int **a2s;
361
362 ret = 0;
363
364 if (vc && (vc->type == VRNA_FC_TYPE_COMPARATIVE)) {
365 n_seq = vc->n_seq;
366 a2s = vc->a2s;
367
368 vrna_sc_init(vc);
369
370 /* count number of SHAPE data available for this alignment */
371 for (n_data = s = 0; shape_file_association[s] != -1; s++) {
372 if (shape_file_association[s] >= n_seq)
373 continue;
374
375 /* try opening the shape data file */
376 if ((fp = fopen(shape_files[s], "r"))) {
377 fclose(fp);
378 n_data++;
379 }
380 }
381
382 weight = (n_data > 0) ? ((float)n_seq / (float)n_data) : 0.;
383
384 /* collect contributions for the sequences in the alignment */
385 contributions = (FLT_OR_DBL **)vrna_alloc(sizeof(FLT_OR_DBL *) * (n_seq));
386
387 for (s = 0; shape_file_association[s] != -1; s++) {
388 int ss = shape_file_association[s]; /* actual sequence number in alignment */
389
390 if (ss >= n_seq) {
391 vrna_message_warning("Failed to associate SHAPE file \"%s\" with sequence %d in alignment! "
392 "Alignment has only %d sequences!",
393 shape_files[s],
394 ss,
395 n_seq);
396 continue;
397 }
398
399 /* read the shape file */
400 if (!(fp = fopen(shape_files[s], "r"))) {
401 vrna_message_warning("Failed to open SHAPE data file \"%d\"! "
402 "No shape data will be used for sequence %d.",
403 s,
404 ss + 1);
405 } else {
406 reactivities = (float *)vrna_alloc(sizeof(float) * (vc->length + 1));
407 sequence = (char *)vrna_alloc(sizeof(char) * (vc->length + 1));
408
409 /* initialize reactivities with missing data for entire alignment length */
410 for (i = 1; i <= vc->length; i++)
411 reactivities[i] = -1.;
412
413 while ((line = vrna_read_line(fp))) {
414 r = sscanf(line, "%d %c %f", &position, &nucleotide, &reactivity);
415 if (r) {
416 if (position <= 0) {
417 vrna_message_warning("SHAPE data for position %d outside alignment!", position);
418 } else if (position > vc->length) {
419 vrna_message_warning("SHAPE data for position %d outside alignment!", position);
420 } else {
421 switch (r) {
422 case 1:
423 nucleotide = 'N';
424 /* fall through */
425 case 2:
426 reactivity = -1.;
427 /* fall through */
428 default:
429 sequence[position - 1] = nucleotide;
430 reactivities[position] = reactivity;
431 break;
432 }
433 }
434 }
435
436 free(line);
437 }
438 fclose(fp);
439
440 sequence[vc->length] = '\0';
441
442 /* double check information by comparing the sequence read from */
443 char *tmp_seq = vrna_seq_ungapped(vc->sequences[shape_file_association[s]]);
444 if (strcmp(tmp_seq, sequence))
445 vrna_message_warning("Input sequence %d differs from sequence provided via SHAPE file!",
446 shape_file_association[s] + 1);
447
448 free(tmp_seq);
449
450 /* begin preparation of the pseudo energies */
451 /* beware of the fact that energy_stack will be accessed through a2s[s] array,
452 * hence pseudo_energy might be gap-free (default)
453 */
454 int gaps, is_gap;
455 contributions[ss] = (FLT_OR_DBL *)vrna_alloc(sizeof(FLT_OR_DBL) * (vc->length + 1));
456 for (gaps = 0, i = 1; i <= vc->length; i++) {
457 is_gap = (vc->sequences[ss][i - 1] == '-') ? 1 : 0;
458 energy = ((i - gaps > 0) && !(is_gap)) ? conversion_deigan(reactivities[i - gaps], m, b) * weight : 0.;
459
460 if (vc->params->model_details.oldAliEn) {
461 contributions[ss][i] = energy;
462 } else if (!is_gap) {
463 contributions[ss][a2s[ss][i]] = energy;
464 }
465
466 gaps += is_gap;
467 }
468
469 free(reactivities);
470 }
471
472
473 }
474
475 ret = vrna_sc_set_stack_comparative(vc, (const FLT_OR_DBL **)contributions, options);
476
477 for (s = 0; s < n_seq; s++)
478 free(contributions[s]);
479
480 free(contributions);
481
482 }
483
484 return ret;
485 }
486
487
488 PUBLIC int
vrna_sc_SHAPE_parse_method(const char * method_string,char * method,float * param_1,float * param_2)489 vrna_sc_SHAPE_parse_method(const char *method_string,
490 char *method,
491 float *param_1,
492 float *param_2)
493 {
494 const char *params = method_string + 1;
495
496 *param_1 = 0;
497 *param_2 = 0;
498
499 if (!method_string || !method_string[0])
500 return 0;
501
502 *method = method_string[0];
503
504 switch (method_string[0]) {
505 case 'Z':
506 *param_1 = 0.89;
507 sc_parse_parameters(params, 'b', '\0', param_1, NULL);
508 break;
509
510 case 'D':
511 *param_1 = 1.8;
512 *param_2 = -0.6;
513 sc_parse_parameters(params, 'm', 'b', param_1, param_2);
514 break;
515
516 case 'W':
517 break;
518
519 default:
520 *method = 0;
521 return 0;
522 }
523
524 return 1;
525 }
526
527
528 PRIVATE void
sc_parse_parameters(const char * string,char c1,char c2,float * v1,float * v2)529 sc_parse_parameters(const char *string,
530 char c1,
531 char c2,
532 float *v1,
533 float *v2)
534 {
535 char *fmt;
536 const char warning[] = "SHAPE method parameters not recognized! Using default parameters!";
537 int r;
538
539 assert(c1);
540 assert(v1);
541
542 if (!string || !(*string))
543 return;
544
545 if (c2 == 0 || v2 == NULL) {
546 fmt = vrna_strdup_printf("%c%%f", c1);
547 r = sscanf(string, fmt, v1);
548
549 if (!r)
550 vrna_message_warning(warning);
551
552 free(fmt);
553
554 return;
555 }
556
557 fmt = vrna_strdup_printf("%c%%f%c%%f", c1, c2);
558 r = sscanf(string, fmt, v1, v2);
559
560 if (r != 2) {
561 free(fmt);
562 fmt = vrna_strdup_printf("%c%%f", c1);
563 r = sscanf(string, fmt, v1);
564
565 if (!r) {
566 free(fmt);
567 fmt = vrna_strdup_printf("%c%%f", c2);
568 r = sscanf(string, fmt, v2);
569
570 if (!r)
571 vrna_message_warning(warning);
572 }
573 }
574
575 free(fmt);
576 }
577
578
579 PRIVATE FLT_OR_DBL
conversion_deigan(double reactivity,double m,double b)580 conversion_deigan(double reactivity,
581 double m,
582 double b)
583 {
584 return reactivity < 0 ? 0. : (FLT_OR_DBL)(m * log(reactivity + 1) + b);
585 }
586
587