1 /*
2     io/file_formats.c
3 
4     Various functions dealing with file formats for RNA sequences, structures, and alignments
5 
6     (c) 2014 Ronny Lorenz
7 
8     Vienna RNA package
9 */
10 
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14 
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <math.h>
19 #include <ctype.h>
20 
21 #include "ViennaRNA/fold_vars.h"
22 #include "ViennaRNA/utils/basic.h"
23 #include "ViennaRNA/io/utils.h"
24 #include "ViennaRNA/constraints/hard.h"
25 #if VRNA_WITH_JSON_SUPPORT
26 # include <json/json.h>
27 #endif
28 #include "ViennaRNA/io/file_formats.h"
29 
30 #define DEBUG
31 /*
32 #################################
33 # PRIVATE VARIABLES             #
34 #################################
35 */
36 
37 PRIVATE char          *inbuf  = NULL;
38 PRIVATE char          *inbuf2 = NULL;
39 PRIVATE unsigned int  typebuf = 0;
40 
41 /*
42 #################################
43 # PRIVATE FUNCTION DECLARATIONS #
44 #################################
45 */
46 
47 PRIVATE unsigned int
48 read_multiple_input_lines(char **string, FILE *file, unsigned int option);
49 
50 PRIVATE void
51 elim_trailing_ws(char *string);
52 
53 /*
54 #################################
55 # BEGIN OF FUNCTION DEFINITIONS #
56 #################################
57 */
58 
59 /* eliminate whitespaces/non-printable characters at the end of a character string */
60 PRIVATE void
elim_trailing_ws(char * string)61 elim_trailing_ws(char *string)
62 {
63   int i, l = strlen(string);
64 
65   for(i = l-1; i >= 0; i--){
66     if (isspace(string[i]) || (!isprint(string[i])))
67       continue;
68 
69     break;
70   }
71   string[(i >= 0) ? (i+1) : 0] = '\0';
72 }
73 
74 PUBLIC void
vrna_file_helixlist(const char * seq,const char * db,float energy,FILE * file)75 vrna_file_helixlist(const char *seq,
76                     const char *db,
77                     float energy,
78                     FILE *file){
79 
80   int         s;
81   short       *pt;
82   vrna_hx_t   *list;
83   FILE *out;
84 
85   if(strlen(seq) != strlen(db)) {
86     vrna_message_warning("vrna_file_helixlist: "
87                          "sequence and structure have unequal length (%d vs. %d)!",
88                          strlen(seq),
89                          strlen(db));
90     return;
91   }
92   out   = (file) ? file : stdout;
93   pt    = vrna_ptable(db);
94   list  = vrna_hx_from_ptable(pt);
95 
96   fprintf(out, "%s\t%6.2f\n", seq, energy);
97   for(s = 0; list[s].length > 0; s++){
98     fprintf(out, "%d\t%d\t%d\n", list[s].start, list[s].end, list[s].length);
99   }
100 
101   free(pt);
102   free(list);
103 }
104 
105 PUBLIC void
vrna_file_connect(const char * seq,const char * db,float energy,const char * identifier,FILE * file)106 vrna_file_connect(const char *seq,
107                   const char *db,
108                   float energy,
109                   const char *identifier,
110                   FILE *file){
111 
112   int i, power_d;
113   FILE *out = (file) ? file : stdout;
114 
115   if(strlen(seq) != strlen(db)) {
116     vrna_message_warning("vrna_file_connect: "
117                          "sequence and structure have unequal length (%d vs. %d)!",
118                          strlen(seq),
119                          strlen(db));
120     return;
121   }
122 
123   short *pt = vrna_ptable(db);
124 
125   for(power_d=0;pow(10,power_d) <= (int)strlen(seq);power_d++);
126 
127   /*
128     Connect table file format looks like this:
129 
130     300  ENERGY = 7.0  example
131       1 G       0    2   22    1
132       2 G       1    3   21    2
133 
134     where the headerline is followed by 6 columns with:
135     1. Base number: index n
136     2. Base (A, C, G, T, U, X)
137     3. Index n-1  (0 if first nucleotide)
138     4. Index n+1  (0 if last nucleotide)
139     5. Number of the base to which n is paired. No pairing is indicated by 0 (zero).
140     6. Natural numbering.
141   */
142 
143   /* print header */
144   fprintf(out, "%d  ENERGY = %6.2f", (int)strlen(seq), energy);
145   if(identifier)
146     fprintf(out, "  %s\n", identifier);
147 
148   /* print structure information except for last line */
149   /* TODO: modify the structure information for cofold */
150   for(i = 0; i < strlen(seq) - 1; i++){
151     fprintf(out, "%*d %c %*d %*d %*d %*d\n",
152                   power_d, i+1,           /* nucleotide index */
153                   (char)toupper(seq[i]),  /* nucleotide char */
154                   power_d, i,             /* nucleotide predecessor index */
155                   power_d, i+2,           /* nucleotide successor index */
156                   power_d, pt[i+1],       /* pairing partner index */
157                   power_d, i+1);          /* nucleotide natural numbering */
158   }
159   /* print last line */
160   fprintf(out, "%*d %c %*d %*d %*d %*d\n",
161                 power_d, i+1,
162                 (char)toupper(seq[i]),
163                 power_d, i,
164                 power_d, 0,
165                 power_d, pt[i+1],
166                 power_d, i+1);
167 
168   /* clean up */
169   free(pt);
170   fflush(out);
171 }
172 
173 PUBLIC void
vrna_file_bpseq(const char * seq,const char * db,FILE * file)174 vrna_file_bpseq(const char *seq,
175                 const char *db,
176                 FILE *file){
177 
178   int i;
179   FILE *out = (file) ? file : stdout;
180 
181   if(strlen(seq) != strlen(db)) {
182     vrna_message_warning("vrna_file_bpseq: "
183                          "sequence and structure have unequal length (%d vs. %d)!",
184                          strlen(seq),
185                          strlen(db));
186     return;
187   }
188 
189   short *pt = vrna_ptable(db);
190 
191   for(i = 1; i <= pt[0]; i++){
192     fprintf(out, "%d %c %d\n", i, (char)toupper(seq[i-1]), pt[i]);
193   }
194 
195   /* clean up */
196   free(pt);
197   fflush(out);
198 }
199 
200 #if VRNA_WITH_JSON_SUPPORT
201 
202 PUBLIC void
vrna_file_json(const char * seq,const char * db,double energy,const char * identifier,FILE * file)203 vrna_file_json( const char *seq,
204                 const char *db,
205                 double energy,
206                 const char *identifier,
207                 FILE *file){
208 
209   FILE *out = (file) ? file : stdout;
210 
211   JsonNode *data  = json_mkobject();
212   JsonNode *value;
213 
214   if(identifier){
215     value = json_mkstring(identifier);
216     json_append_member(data, "id", value);
217   }
218 
219   value = json_mkstring(seq);
220   json_append_member(data, "sequence", value);
221 
222   value = json_mknumber(energy);
223   json_append_member(data, "mfe", value);
224 
225   value = json_mkstring(db);
226   json_append_member(data, "structure", value);
227 
228 
229   fprintf(out, "%s\n", json_stringify(data, "\t"));
230 
231   fflush(out);
232 }
233 
234 #endif
235 
236 PRIVATE  unsigned int
read_multiple_input_lines(char ** string,FILE * file,unsigned int option)237 read_multiple_input_lines(char **string,
238                           FILE *file,
239                           unsigned int option){
240 
241   char  *line;
242   int   i, l;
243   int   state = 0;
244   int   str_length = 0;
245   FILE  *in = (file) ? file : stdin;
246 
247   line = (inbuf2) ? inbuf2 : vrna_read_line(in);
248   inbuf2 = NULL;
249   do{
250 
251     /*
252     * read lines until informative data appears or
253     * report an error if anything goes wrong
254     */
255     if(!line) return VRNA_INPUT_ERROR;
256 
257     l = (int)strlen(line);
258 
259     /* eliminate whitespaces at the end of the line read */
260     if(!(option & VRNA_INPUT_NO_TRUNCATION))
261       elim_trailing_ws(line);
262 
263     l           = (int)strlen(line);
264     str_length  = (*string) ? (int) strlen(*string) : 0;
265 
266     switch(*line){
267       case  '@':    /* user abort */
268                     if(state) inbuf2 = line;
269                     else      free(line);
270                     return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_QUIT;
271 
272       case  '\0':   /* empty line */
273                     if(option & VRNA_INPUT_NOSKIP_BLANK_LINES){
274                       if(state) inbuf2 = line;
275                       else      free(line);
276                       return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_BLANK_LINE;
277                     }
278                     break;
279 
280       case  '#': case  '%': case  ';': case  '/': case  '*': case ' ':
281                     /* comments */
282                     if(option & VRNA_INPUT_NOSKIP_COMMENTS){
283                       if(state) inbuf2   = line;
284                       else      *string = line;
285                       return (state == 2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_COMMENT;
286                     }
287                     break;
288 
289       case  '>':    /* fasta header */
290                     if(state) inbuf2   = line;
291                     else      *string = line;
292                     return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_FASTA_HEADER;
293 
294       case  'x': case 'e': case 'l': case '&':   /* seems to be a constraint or line starting with second sequence for dimer calculations */
295                     i = 1;
296                     /* lets see if this assumption holds for the complete line */
297                     while((line[i] == 'x') || (line[i] == 'e') || (line[i] == 'l')) i++;
298                     /* lines solely consisting of 'x's, 'e's or 'l's will be considered as structure constraint */
299 
300                     if(
301                             ((line[i]>64) && (line[i]<91))  /* A-Z */
302                         ||  ((line[i]>96) && (line[i]<123)) /* a-z */
303                       ){
304                       if(option & VRNA_INPUT_FASTA_HEADER){
305                         /* are we in structure mode? Then we remember this line for the next round */
306                         if(state == 2){ inbuf2 = line; return VRNA_INPUT_CONSTRAINT;}
307                         else{
308                           *string = (char *)vrna_realloc(*string, sizeof(char) * (str_length + l + 1));
309                           memcpy(*string + str_length,
310                                  line,
311                                  sizeof(char) * l);
312                           (*string)[str_length + l] = '\0';
313                           state = 1;
314                         }
315                         break;
316                       }
317                       /* otherwise return line read */
318                       else{ *string = line; return VRNA_INPUT_SEQUENCE;}
319                     }
320                     /* mmmh? it really seems to be a constraint */
321                     /* fallthrough */
322       case  '<': case  '.': case  '|': case  '(': case ')': case '[': case ']': case '{': case '}': case ',': case '+':
323                     /* seems to be a structure or a constraint */
324                     /* either we concatenate this line to one that we read previously */
325                     if(option & VRNA_INPUT_FASTA_HEADER){
326                       if(state == 1){
327                         inbuf2 = line;
328                         return VRNA_INPUT_SEQUENCE;
329                       }
330                       else{
331                         *string = (char *)vrna_realloc(*string, sizeof(char) * (str_length + l + 1));
332                         memcpy(*string + str_length,
333                                line,
334                                sizeof(char) * l);
335                         (*string)[str_length + l] = '\0';
336                         state = 2;
337                       }
338                     }
339                     /* or we return it as it is */
340                     else{
341                       *string = line;
342                       return VRNA_INPUT_CONSTRAINT;
343                     }
344                     break;
345       default:      if(option & VRNA_INPUT_FASTA_HEADER){
346                       /* are we already in sequence mode? */
347                       if(state == 2){
348                         inbuf2 = line;
349                         return VRNA_INPUT_CONSTRAINT;
350                       }
351                       else{
352                         *string = (char *)vrna_realloc(*string, sizeof(char) * (str_length + l + 1));
353                         memcpy(*string + str_length,
354                                line,
355                                sizeof(char) * l);
356                         (*string)[str_length + l] = '\0';
357                         state = 1;
358                       }
359                     }
360                     /* otherwise return line read */
361                     else{
362                       *string = line;
363                       return VRNA_INPUT_SEQUENCE;
364                     }
365     }
366     free(line);
367     line = vrna_read_line(in);
368   }while(line);
369 
370   return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_ERROR;
371 }
372 
373 PUBLIC  unsigned int
vrna_file_fasta_read_record(char ** header,char ** sequence,char *** rest,FILE * file,unsigned int options)374 vrna_file_fasta_read_record( char **header,
375                         char **sequence,
376                         char ***rest,
377                         FILE *file,
378                         unsigned int options){
379 
380   unsigned int  input_type, return_type, tmp_type;
381   int           rest_count;
382   char          *input_string;
383 
384   rest_count    = 0;
385   return_type   = tmp_type = 0;
386   input_string  = *header = *sequence = NULL;
387   *rest         = (char **)vrna_alloc(sizeof(char *));
388 
389   /* remove unnecessary option flags from options variable... */
390   options &= ~VRNA_INPUT_FASTA_HEADER;
391 
392   /* read first input or last buffered input */
393   if(typebuf){
394     input_type    = typebuf;
395     input_string  = inbuf;
396     typebuf       = 0;
397     inbuf         = NULL;
398   }
399   else input_type  = read_multiple_input_lines(&input_string, file, options);
400 
401   if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return input_type;
402 
403   /* skip everything until we read either a fasta header or a sequence */
404   while(input_type & (VRNA_INPUT_MISC | VRNA_INPUT_CONSTRAINT | VRNA_INPUT_BLANK_LINE)){
405     free(input_string); input_string = NULL;
406     input_type    = read_multiple_input_lines(&input_string, file, options);
407     if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return input_type;
408   }
409 
410   if(input_type & VRNA_INPUT_FASTA_HEADER){
411     return_type  |= VRNA_INPUT_FASTA_HEADER; /* remember that we've read a fasta header */
412     *header       = input_string;
413     input_string  = NULL;
414     /* get next data-block with fasta support if not explicitely forbidden by VRNA_INPUT_NO_SPAN */
415     input_type  = read_multiple_input_lines(
416                     &input_string,
417                     file,
418                     ((options & VRNA_INPUT_NO_SPAN) ? 0 : VRNA_INPUT_FASTA_HEADER) | options
419                   );
420     if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return (return_type | input_type);
421   }
422 
423   if(input_type & VRNA_INPUT_SEQUENCE){
424     return_type  |= VRNA_INPUT_SEQUENCE; /* remember that we've read a sequence */
425     *sequence     = input_string;
426     input_string  = NULL;
427   } else {
428     vrna_message_warning("vrna_file_fasta_read_record: "
429                          "sequence input missing!");
430     return VRNA_INPUT_ERROR;
431   }
432 
433   /* read the rest until we find user abort, EOF, new sequence or new fasta header */
434   if(!(options & VRNA_INPUT_NO_REST)){
435     options |= VRNA_INPUT_NOSKIP_COMMENTS; /* allow commetns to appear in rest output */
436     tmp_type = VRNA_INPUT_QUIT | VRNA_INPUT_ERROR | VRNA_INPUT_SEQUENCE | VRNA_INPUT_FASTA_HEADER;
437     if(options & VRNA_INPUT_NOSKIP_BLANK_LINES) tmp_type |= VRNA_INPUT_BLANK_LINE;
438     while(!((input_type = read_multiple_input_lines(&input_string, file, options)) & tmp_type)){
439       *rest = vrna_realloc(*rest, sizeof(char **)*(++rest_count + 1));
440       (*rest)[rest_count-1] = input_string;
441       input_string = NULL;
442     }
443     /*
444     if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return input_type;
445     */
446 
447     /*  finished reading everything...
448     *   we now put the last line into the buffer if necessary
449     *   since it should belong to the next record
450     */
451     inbuf = input_string;
452     typebuf = input_type;
453   }
454   (*rest)[rest_count] = NULL;
455   return (return_type);
456 }
457 
458 PUBLIC char *
vrna_extract_record_rest_structure(const char ** lines,unsigned int length,unsigned int options)459 vrna_extract_record_rest_structure( const char **lines,
460                                     unsigned int length,
461                                     unsigned int options){
462 
463   char *structure = NULL;
464   int r, i, l, cl, stop;
465   char *c;
466 
467   if(lines){
468     for(r = i = stop = 0; lines[i]; i++){
469       l   = (int)strlen(lines[i]);
470       c   = (char *) vrna_alloc(sizeof(char) * (l+1));
471       (void) sscanf(lines[i], "%s", c);
472       cl  = (int)strlen(c);
473 
474       /* line commented out ? */
475       if((*c == '#') || (*c == '%') || (*c == ';') || (*c == '/') || (*c == '*' || (*c == '\0'))){
476         /* skip leading comments only, i.e. do not allow comments inside the constraint */
477         if(!r)  continue;
478         else    break;
479       }
480 
481       /* append the structure part to the output */
482       r += cl+1;
483       structure = (char *)vrna_realloc(structure, r*sizeof(char));
484       strcat(structure, c);
485       free(c);
486       /* stop if the assumed structure length has been reached */
487       if((length > 0) && (r-1 == length)) break;
488       /* stop if not allowed to read from multiple lines */
489       if(!(options & VRNA_OPTION_MULTILINE)) break;
490     }
491   }
492   return structure;
493 }
494 
495 PUBLIC void
vrna_extract_record_rest_constraint(char ** cstruc,const char ** lines,unsigned int option)496 vrna_extract_record_rest_constraint(char **cstruc,
497                                     const char **lines,
498                                     unsigned int option){
499 
500   *cstruc = vrna_extract_record_rest_structure(lines, 0, option | (option & VRNA_CONSTRAINT_MULTILINE) ? VRNA_OPTION_MULTILINE : 0);
501 
502 }
503 
504 PUBLIC int
vrna_file_SHAPE_read(const char * file_name,int length,double default_value,char * sequence,double * values)505 vrna_file_SHAPE_read( const char *file_name,
506                       int length,
507                       double default_value,
508                       char *sequence,
509                       double *values){
510 
511   FILE *fp;
512   char *line;
513   int i;
514   int count = 0;
515 
516   if(!file_name)
517     return 0;
518 
519   if(!(fp = fopen(file_name, "r"))){
520     vrna_message_warning("SHAPE data file could not be opened");
521     return 0;
522   }
523 
524   for (i = 0; i < length; ++i)
525   {
526     sequence[i] = 'N';
527     values[i + 1] = default_value;
528   }
529 
530   sequence[length] = '\0';
531 
532   while((line=vrna_read_line(fp))){
533     int position;
534     unsigned char nucleotide = 'N';
535     double reactivity = default_value;
536     char *second_entry = 0;
537     char *third_entry = 0;
538     char *c;
539 
540     if(sscanf(line, "%d", &position) != 1)
541     {
542       free(line);
543       continue;
544     }
545 
546     if(position <= 0 || position > length)
547     {
548       vrna_message_warning("Provided SHAPE data outside of sequence scope");
549       fclose(fp);
550       free(line);
551       return 0;
552     }
553 
554     for(c = line + 1; *c; ++c){
555       if(isspace(*(c-1)) && !isspace(*c)) {
556         if(!second_entry){
557           second_entry = c;
558         }else{
559           third_entry = c;
560           break;
561         }
562       }
563     }
564 
565     if(second_entry){
566       if(third_entry){
567         sscanf(second_entry, "%c", &nucleotide);
568         sscanf(third_entry, "%lf", &reactivity);
569       }else if(sscanf(second_entry, "%lf", &reactivity) != 1)
570         sscanf(second_entry, "%c", &nucleotide);
571     }
572 
573     sequence[position-1] = nucleotide;
574     values[position] = reactivity;
575     ++count;
576 
577     free(line);
578   }
579 
580   fclose(fp);
581 
582   if(!count)
583   {
584       vrna_message_warning("SHAPE data file is empty");
585       return 0;
586   }
587 
588   return 1;
589 }
590 
591 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
592 
593 /*###########################################*/
594 /*# deprecated functions below              #*/
595 /*###########################################*/
596 
597 PUBLIC unsigned int
get_multi_input_line(char ** string,unsigned int option)598 get_multi_input_line( char **string,
599                       unsigned int option){
600 
601   return read_multiple_input_lines(string, NULL, option);
602 }
603 
604 PUBLIC unsigned int
read_record(char ** header,char ** sequence,char *** rest,unsigned int options)605 read_record(char **header,
606             char **sequence,
607             char  ***rest,
608             unsigned int options){
609 
610   return vrna_file_fasta_read_record(header, sequence, rest, NULL, options);
611 }
612 
613 PUBLIC char *
extract_record_rest_structure(const char ** lines,unsigned int length,unsigned int options)614 extract_record_rest_structure(const char **lines,
615                               unsigned int length,
616                               unsigned int options){
617 
618   return vrna_extract_record_rest_structure(lines, length, options);
619 }
620 
621 
622 #endif
623