1 /*
2  *              Ineractive access to suboptimal folding
3  *
4  *                         c Ivo L Hofacker
5  *                        Vienna RNA package
6  */
7 
8 #ifdef HAVE_CONFIG_H
9 #include "config.h"
10 #endif
11 
12 #include <config.h>
13 #include <assert.h>
14 #include <stdio.h>
15 #include <stdlib.h>
16 #include <unistd.h>
17 #include <math.h>
18 #include <ctype.h>
19 #include <string.h>
20 #include "ViennaRNA/part_func.h"
21 #include "ViennaRNA/fold.h"
22 #include "ViennaRNA/cofold.h"
23 #include "ViennaRNA/fold_vars.h"
24 #include "ViennaRNA/utils/basic.h"
25 #include "ViennaRNA/utils/strings.h"
26 #include "ViennaRNA/params/io.h"
27 #include "ViennaRNA/subopt.h"
28 #include "ViennaRNA/params/basic.h"
29 #include "ViennaRNA/constraints/basic.h"
30 #include "ViennaRNA/constraints/SHAPE.h"
31 #include "ViennaRNA/io/file_formats.h"
32 #include "ViennaRNA/io/utils.h"
33 #include "ViennaRNA/commands.h"
34 #include "RNAsubopt_cmdl.h"
35 #include "gengetopt_helper.h"
36 #include "input_id_helpers.h"
37 
38 #include "ViennaRNA/color_output.inc"
39 
40 PRIVATE void putoutzuker(FILE                   *output,
41                          vrna_subopt_solution_t *zukersolution);
42 
43 
44 struct nr_en_data {
45   FILE                  *output;
46   vrna_fold_compound_t  *fc;
47   double                kT;
48   double                ens_en;
49 };
50 
51 
52 PRIVATE void
53 print_samples(const char  *structure,
54               void        *data);
55 
56 
57 PRIVATE void
58 print_samples_en(const char *structure,
59                  void       *data);
60 
61 
62 int
main(int argc,char * argv[])63 main(int  argc,
64      char *argv[])
65 {
66   FILE                                *input, *output;
67   struct          RNAsubopt_args_info args_info;
68   char                                *rec_sequence, *rec_id,
69                                       **rec_rest, *orig_sequence, *constraints_file, *cstruc,
70                                       *structure, *shape_file, *shape_method, *shape_conversion,
71                                       *infile, *outfile, *filename_delim;
72   unsigned int                        rec_type, read_opt;
73   int                                 i, length, cl, istty, delta, n_back, noconv, dos, zuker,
74                                       with_shapes, verbose, enforceConstraints, st_back_en, batch,
75                                       tofile, filename_full, canonicalBPonly, nonRedundant;
76   double                              deltap;
77   vrna_md_t                           md;
78   dataset_id                          id_control;
79   vrna_cmd_t                          commands;
80 
81   do_backtrack    = 1;
82   delta           = 100;
83   deltap          = n_back = noconv = dos = zuker = 0;
84   rec_type        = read_opt = 0;
85   rec_id          = rec_sequence = orig_sequence = NULL;
86   rec_rest        = NULL;
87   cstruc          = structure = NULL;
88   verbose         = 0;
89   st_back_en      = 0;
90   infile          = NULL;
91   outfile         = NULL;
92   output          = NULL;
93   tofile          = 0;
94   filename_full   = 0;
95   canonicalBPonly = 0;
96   commands        = NULL;
97   nonRedundant    = 0;
98 
99   set_model_details(&md);
100 
101   /* switch on unique multibranch loop decomposition */
102   md.uniq_ML = 1;
103 
104   /*
105    #############################################
106    # check the command line parameters
107    #############################################
108    */
109   if (RNAsubopt_cmdline_parser(argc, argv, &args_info) != 0)
110     exit(1);
111 
112   /* parse options for ID manipulation */
113   ggo_get_id_control(args_info, id_control, "Sequence", "sequence", "_", 4, 1);
114 
115   /* get basic set of model details */
116   ggo_get_md_eval(args_info, md);
117   ggo_get_md_fold(args_info, md);
118   ggo_get_md_part(args_info, md);
119   ggo_get_circ(args_info, md.circ);
120 
121   /* temperature */
122   ggo_get_temperature(args_info, md.temperature);
123 
124   /* check dangle model */
125   if ((md.dangles < 0) || (md.dangles > 3)) {
126     vrna_message_warning("required dangle model not implemented, falling back to default dangles=2");
127     md.dangles = dangles = 2;
128   }
129 
130   /* SHAPE reactivity data */
131   ggo_get_SHAPE(args_info, with_shapes, shape_file, shape_method, shape_conversion);
132 
133   ggo_get_constraints_settings(args_info,
134                                fold_constrained,
135                                constraints_file,
136                                enforceConstraints,
137                                batch);
138 
139   if (args_info.verbose_given)
140     verbose = 1;
141 
142   /* enforce canonical base pairs in any case? */
143   if (args_info.canonicalBPonly_given)
144     canonicalBPonly = 1;
145 
146   /* do not convert DNA nucleotide "T" to appropriate RNA "U" */
147   if (args_info.noconv_given)
148     noconv = 1;
149 
150   /* energy range */
151   if (args_info.deltaEnergy_given)
152     delta = (int)(0.1 + args_info.deltaEnergy_arg * 100);
153 
154   /* energy range after post evaluation */
155   if (args_info.deltaEnergyPost_given)
156     deltap = args_info.deltaEnergyPost_arg;
157 
158   /* sorted output */
159   if (args_info.sorted_given) {
160     subopt_sorted = VRNA_SORT_BY_ENERGY_LEXICOGRAPHIC_ASC;
161     if (args_info.en_only_given)
162       subopt_sorted = VRNA_SORT_BY_ENERGY_ASC;
163   }
164 
165   /* stochastic backtracking */
166   if (args_info.stochBT_given) {
167     n_back = args_info.stochBT_arg;
168     vrna_init_rand();
169     md.compute_bpp = 0;
170   }
171 
172   if (args_info.stochBT_en_given) {
173     n_back          = args_info.stochBT_en_arg;
174     st_back_en      = 1;
175     md.compute_bpp  = 0;
176     vrna_init_rand();
177   }
178 
179   /* density of states */
180   if (args_info.dos_given) {
181     dos           = 1;
182     print_energy  = -999999;
183   }
184 
185   /* logarithmic multiloop energies */
186   if (args_info.logML_given)
187     md.logML = logML = 1;
188 
189   /* zuker subopts */
190   if (args_info.zuker_given)
191     zuker = 1;
192 
193   if (zuker) {
194     if (md.circ) {
195       vrna_message_warning("Sorry, zuker subopts not yet implemented for circfold");
196       RNAsubopt_cmdline_parser_print_help();
197       exit(1);
198     } else if (n_back > 0) {
199       vrna_message_warning("Can't do zuker subopts and stochastic subopts at the same time");
200       RNAsubopt_cmdline_parser_print_help();
201       exit(1);
202     } else if (md.gquad) {
203       vrna_message_warning("G-quadruplex support for Zuker subopts not implemented yet");
204       RNAsubopt_cmdline_parser_print_help();
205       exit(1);
206     }
207   }
208 
209   if (md.gquad && (n_back > 0)) {
210     vrna_message_warning("G-quadruplex support for stochastic backtracking not implemented yet");
211     RNAsubopt_cmdline_parser_print_help();
212     exit(1);
213   }
214 
215   if (args_info.infile_given)
216     infile = strdup(args_info.infile_arg);
217 
218   if (args_info.outfile_given) {
219     tofile = 1;
220     if (args_info.outfile_arg)
221       outfile = strdup(args_info.outfile_arg);
222   }
223 
224   /* filename sanitize delimiter */
225   if (args_info.filename_delim_given)
226     filename_delim = strdup(args_info.filename_delim_arg);
227   else if (get_id_delim(id_control))
228     filename_delim = strdup(get_id_delim(id_control));
229   else
230     filename_delim = NULL;
231 
232   if ((filename_delim) && isspace(*filename_delim)) {
233     free(filename_delim);
234     filename_delim = NULL;
235   }
236 
237   /* full filename from FASTA header support */
238   if (args_info.filename_full_given)
239     filename_full = 1;
240 
241   /* non-redundant backtracing */
242   if (args_info.nonRedundant_given)
243     nonRedundant = 1;
244 
245   if (args_info.commands_given)
246     commands = vrna_file_commands_read(args_info.commands_arg,
247                                        VRNA_CMD_PARSE_HC | VRNA_CMD_PARSE_SC);
248 
249   /* free allocated memory of command line data structure */
250   RNAsubopt_cmdline_parser_free(&args_info);
251 
252   /*
253    #############################################
254    # begin initializing
255    #############################################
256    */
257 
258   if (infile) {
259     input = fopen((const char *)infile, "r");
260     if (!input)
261       vrna_message_error("Could not read input file");
262   } else {
263     input = stdin;
264   }
265 
266   istty = (!infile) && isatty(fileno(stdout)) && isatty(fileno(stdin));
267 
268   /* print user help if we get input from tty */
269   if (istty) {
270     if (!zuker)
271       print_comment(stdout, "Use '&' to connect 2 sequences that shall form a complex.");
272 
273     if (fold_constrained) {
274       vrna_message_constraint_options(
275         VRNA_CONSTRAINT_DB_DOT | VRNA_CONSTRAINT_DB_X | VRNA_CONSTRAINT_DB_ANG_BRACK |
276         VRNA_CONSTRAINT_DB_RND_BRACK);
277       vrna_message_input_seq("Input sequence (upper or lower case) followed by structure constraint");
278     } else {
279       vrna_message_input_seq_simple();
280     }
281   }
282 
283   /* set options we wanna pass to vrna_file_fasta_read_record() */
284   if (istty)
285     read_opt |= VRNA_INPUT_NOSKIP_BLANK_LINES;
286 
287   if (!fold_constrained)
288     read_opt |= VRNA_INPUT_NO_REST;
289 
290   /*
291    #############################################
292    # main loop: continue until end of file
293    #############################################
294    */
295   while (
296     !((rec_type = vrna_file_fasta_read_record(&rec_id, &rec_sequence, &rec_rest, input, read_opt))
297       & (VRNA_INPUT_ERROR | VRNA_INPUT_QUIT))) {
298     char  *SEQ_ID         = NULL;
299     char  *v_file_name    = NULL;
300     char  *tmp_string     = NULL;
301     int   maybe_multiline = 0;
302 
303     /*
304      ########################################################
305      # init everything according to the data we've read
306      ########################################################
307      */
308     if (rec_id) {
309       maybe_multiline = 1;
310       /* remove '>' from FASTA header */
311       rec_id = memmove(rec_id, rec_id + 1, strlen(rec_id));
312     }
313 
314     /* construct the sequence ID */
315     set_next_id(&rec_id, id_control);
316     SEQ_ID = fileprefix_from_id(rec_id, id_control, filename_full);
317 
318     if (tofile) {
319       /* prepare the file name */
320       if (outfile)
321         v_file_name = vrna_strdup_printf("%s", outfile);
322       else
323         v_file_name = (SEQ_ID) ?
324                       vrna_strdup_printf("%s.sub", SEQ_ID) :
325                       vrna_strdup_printf("RNAsubopt_output.sub");
326 
327       tmp_string = vrna_filename_sanitize(v_file_name, filename_delim);
328       free(v_file_name);
329       v_file_name = tmp_string;
330 
331       if (infile && !strcmp(infile, v_file_name))
332         vrna_message_error("Input and output file names are identical");
333 
334       output = fopen((const char *)v_file_name, "a");
335       if (!output)
336         vrna_message_error("Failed to open file for writing");
337     } else {
338       output = stdout;
339     }
340 
341     /* convert DNA alphabet to RNA if not explicitely switched off */
342     if (!noconv)
343       vrna_seq_toRNA(rec_sequence);
344 
345     /* store case-unmodified sequence */
346     orig_sequence = strdup(rec_sequence);
347     /* convert sequence to uppercase letters only */
348     vrna_seq_toupper(rec_sequence);
349 
350     vrna_fold_compound_t *vc =
351       vrna_fold_compound(rec_sequence, &md,
352                          VRNA_OPTION_MFE | (circ ? 0 : VRNA_OPTION_HYBRID) |
353                          ((n_back > 0) ? VRNA_OPTION_PF : 0));
354     length = vc->length;
355 
356     structure = (char *)vrna_alloc(sizeof(char) * (length + 1));
357 
358     /* parse the rest of the current dataset to obtain a structure constraint */
359     if (fold_constrained) {
360       if (constraints_file) {
361         vrna_constraints_add(vc, constraints_file, VRNA_OPTION_DEFAULT);
362       } else {
363         cstruc = NULL;
364         int           cp        = -1;
365         unsigned int  coptions  = (maybe_multiline) ? VRNA_OPTION_MULTILINE : 0;
366         cstruc  = vrna_extract_record_rest_structure((const char **)rec_rest, 0, coptions);
367         cstruc  = vrna_cut_point_remove(cstruc, &cp);
368         if (vc->cutpoint != cp) {
369           vrna_message_error("Sequence and Structure have different cut points.\n"
370                              "sequence: %d, structure: %d",
371                              vc->cutpoint, cp);
372         }
373 
374         cl = (cstruc) ? (int)strlen(cstruc) : 0;
375 
376         if (cl == 0)
377           vrna_message_warning("Structure constraint is missing");
378         else if (cl < length)
379           vrna_message_warning("Structure constraint is shorter than sequence");
380         else if (cl > length)
381           vrna_message_error("Structure constraint is too long");
382 
383         if (cstruc) {
384           /* convert pseudo-dot-bracket to actual hard constraints */
385           unsigned int constraint_options = VRNA_CONSTRAINT_DB_DEFAULT;
386 
387           if (enforceConstraints)
388             constraint_options |= VRNA_CONSTRAINT_DB_ENFORCE_BP;
389 
390           if (canonicalBPonly)
391             constraint_options |= VRNA_CONSTRAINT_DB_CANONICAL_BP;
392 
393           vrna_constraints_add(vc, (const char *)cstruc, constraint_options);
394         }
395       }
396     }
397 
398     if (with_shapes) {
399       vrna_constraints_add_SHAPE(vc,
400                                  shape_file,
401                                  shape_method,
402                                  shape_conversion,
403                                  verbose,
404                                  VRNA_OPTION_MFE | ((n_back > 0) ? VRNA_OPTION_PF : 0));
405     }
406 
407     if (commands)
408       vrna_commands_apply(vc,
409                           commands,
410                           VRNA_CMD_PARSE_HC | VRNA_CMD_PARSE_SC);
411 
412     if (istty) {
413       if (cut_point == -1) {
414         vrna_message_info(stdout, "length = %d", length);
415       } else {
416         vrna_message_info(stdout,
417                           "length1 = %d\nlength2 = %d",
418                           cut_point - 1,
419                           length - cut_point + 1);
420       }
421     }
422 
423     /*
424      ########################################################
425      # begin actual computations
426      ########################################################
427      */
428 
429     if ((logML != 0 || md.dangles == 1 || md.dangles == 3) && dos == 0)
430       if (deltap <= 0)
431         deltap = delta / 100. + 0.001;
432 
433     if (deltap > 0)
434       print_energy = deltap;
435 
436     /* stochastic backtracking */
437     if (n_back > 0) {
438       double        mfe, kT, ens_en;
439       unsigned int  options = (nonRedundant) ?
440                               VRNA_PBACKTRACK_NON_REDUNDANT :
441                               VRNA_PBACKTRACK_DEFAULT;
442 
443       if (vc->cutpoint != -1)
444         vrna_message_error("Boltzmann sampling for cofolded structures not implemented (yet)!");
445 
446       print_fasta_header(output, rec_id);
447 
448       fprintf(output, "%s\n", rec_sequence);
449 
450       mfe = vrna_mfe(vc, structure);
451       /* rescale Boltzmann factors according to predicted MFE */
452       vrna_exp_params_rescale(vc, &mfe);
453 
454       vrna_mx_mfe_free(vc);
455 
456       /* ignore return value, we are not interested in the free energy */
457       ens_en  = vrna_pf(vc, structure);
458       kT      = vc->exp_params->kT / 1000.;
459 
460       if (st_back_en) {
461         struct nr_en_data dat;
462         dat.output  = output;
463         dat.fc      = vc;
464         dat.kT      = kT;
465         dat.ens_en  = ens_en;
466 
467         vrna_pbacktrack_cb(vc,
468                            n_back,
469                            &print_samples_en,
470                            (void *)&dat,
471                            options);
472       } else {
473         vrna_pbacktrack_cb(vc,
474                            n_back,
475                            &print_samples,
476                            (void *)output,
477                            options);
478       }
479     }
480     /* normal subopt */
481     else if (!zuker) {
482       /* first lines of output (suitable  for sort +1n) */
483       if (rec_id) {
484         char *head = vrna_strdup_printf("%s [%d]", rec_id, delta);
485         print_fasta_header(output, head);
486         free(head);
487       }
488 
489       vrna_subopt(vc, delta, subopt_sorted, output);
490 
491       if (dos) {
492         int i;
493         for (i = 0; i <= MAXDOS && i <= delta / 10; i++) {
494           char *tline = vrna_strdup_printf("%4d %6d", i, density_of_states[i]);
495           print_table(output, NULL, tline);
496           free(tline);
497         }
498       }
499     }
500     /* Zuker suboptimals */
501     else {
502       vrna_subopt_solution_t  *zr;
503 
504       if (vc->cutpoint != -1)
505         vrna_message_error("Sorry, zuker subopts not yet implemented for cofold");
506 
507       int                     i;
508       print_fasta_header(output, rec_id);
509 
510       fprintf(output, "%s\n", rec_sequence);
511 
512       zr = vrna_subopt_zuker(vc);
513 
514       putoutzuker(output, zr);
515       (void)fflush(output);
516       for (i = 0; zr[i].structure; i++)
517         free(zr[i].structure);
518       free(zr);
519     }
520 
521     (void)fflush(output);
522 
523     /* clean up */
524     vrna_fold_compound_free(vc);
525 
526     free(cstruc);
527 
528     free(rec_id);
529     free(SEQ_ID);
530     free(rec_sequence);
531     free(orig_sequence);
532     free(structure);
533 
534     /* free the rest of current dataset */
535     if (rec_rest) {
536       for (i = 0; rec_rest[i]; i++)
537         free(rec_rest[i]);
538       free(rec_rest);
539     }
540 
541     rec_id    = rec_sequence = orig_sequence = structure = cstruc = NULL;
542     rec_rest  = NULL;
543 
544     if (tofile && output) {
545       fclose(output);
546       output = NULL;
547     }
548 
549     free(v_file_name);
550 
551     if (with_shapes || (constraints_file && (!batch)))
552       break;
553 
554     /* print user help for the next round if we get input from tty */
555     if (istty) {
556       if (!zuker)
557         print_comment(stdout, "Use '&' to connect 2 sequences that shall form a complex.");
558 
559       if (fold_constrained) {
560         vrna_message_constraint_options(
561           VRNA_CONSTRAINT_DB_DOT | VRNA_CONSTRAINT_DB_X | VRNA_CONSTRAINT_DB_ANG_BRACK |
562           VRNA_CONSTRAINT_DB_RND_BRACK);
563         vrna_message_input_seq(
564           "Input sequence (upper or lower case) followed by structure constraint");
565       } else {
566         vrna_message_input_seq_simple();
567       }
568     }
569   }
570 
571   if (infile && input)
572     fclose(input);
573 
574   free(shape_method);
575   free(shape_conversion);
576   free(filename_delim);
577   vrna_commands_free(commands);
578 
579   free_id_data(id_control);
580 
581   return EXIT_SUCCESS;
582 }
583 
584 
585 PRIVATE void
print_samples(const char * structure,void * data)586 print_samples(const char  *structure,
587               void        *data)
588 {
589   if (structure)
590     print_structure((FILE *)data, structure, NULL);
591 }
592 
593 
594 PRIVATE void
print_samples_en(const char * structure,void * data)595 print_samples_en(const char *structure,
596                  void       *data)
597 {
598   if (structure) {
599     struct nr_en_data     *d      = (struct nr_en_data *)data;
600     FILE                  *output = d->output;
601     vrna_fold_compound_t  *fc     = d->fc;
602     double                kT      = d->kT;
603     double                ens_en  = d->ens_en;
604 
605     double                e         = vrna_eval_structure(fc, structure);
606     double                prob      = exp((ens_en - e) / kT);
607     char                  *e_string = vrna_strdup_printf(" %6.2f %6g", e, prob);
608 
609     print_structure(output, structure, e_string);
610 
611     free(e_string);
612   }
613 }
614 
615 
616 PRIVATE void
putoutzuker(FILE * output,vrna_subopt_solution_t * zukersolution)617 putoutzuker(FILE                    *output,
618             vrna_subopt_solution_t  *zukersolution)
619 {
620   int   i;
621   char  *e_string;
622 
623   for (i = 0; zukersolution[i].structure; i++) {
624     e_string = vrna_strdup_printf(" [%6.2f]", zukersolution[i].energy);
625     print_structure(output, zukersolution[i].structure, e_string);
626     free(e_string);
627   }
628   return;
629 }
630