1 /*
2 io/file_formats.c
3
4 Various functions dealing with file formats for RNA sequences, structures, and alignments
5
6 (c) 2014 Ronny Lorenz
7
8 Vienna RNA package
9 */
10
11 #ifdef HAVE_CONFIG_H
12 #include "config.h"
13 #endif
14
15 #include <stdio.h>
16 #include <stdlib.h>
17 #include <string.h>
18 #include <math.h>
19 #include <ctype.h>
20
21 #include "ViennaRNA/fold_vars.h"
22 #include "ViennaRNA/utils/basic.h"
23 #include "ViennaRNA/io/utils.h"
24 #include "ViennaRNA/constraints/hard.h"
25 #if VRNA_WITH_JSON_SUPPORT
26 # include <json/json.h>
27 #endif
28 #include "ViennaRNA/io/file_formats.h"
29
30 #define DEBUG
31 /*
32 #################################
33 # PRIVATE VARIABLES #
34 #################################
35 */
36
37 PRIVATE char *inbuf = NULL;
38 PRIVATE char *inbuf2 = NULL;
39 PRIVATE unsigned int typebuf = 0;
40
41 /*
42 #################################
43 # PRIVATE FUNCTION DECLARATIONS #
44 #################################
45 */
46
47 PRIVATE unsigned int
48 read_multiple_input_lines(char **string, FILE *file, unsigned int option);
49
50 PRIVATE void
51 elim_trailing_ws(char *string);
52
53 /*
54 #################################
55 # BEGIN OF FUNCTION DEFINITIONS #
56 #################################
57 */
58
59 /* eliminate whitespaces/non-printable characters at the end of a character string */
60 PRIVATE void
elim_trailing_ws(char * string)61 elim_trailing_ws(char *string)
62 {
63 int i, l = strlen(string);
64
65 for(i = l-1; i >= 0; i--){
66 if (isspace(string[i]) || (!isprint(string[i])))
67 continue;
68
69 break;
70 }
71 string[(i >= 0) ? (i+1) : 0] = '\0';
72 }
73
74 PUBLIC void
vrna_file_helixlist(const char * seq,const char * db,float energy,FILE * file)75 vrna_file_helixlist(const char *seq,
76 const char *db,
77 float energy,
78 FILE *file){
79
80 int s;
81 short *pt;
82 vrna_hx_t *list;
83 FILE *out;
84
85 if(strlen(seq) != strlen(db)) {
86 vrna_message_warning("vrna_file_helixlist: "
87 "sequence and structure have unequal length (%d vs. %d)!",
88 strlen(seq),
89 strlen(db));
90 return;
91 }
92 out = (file) ? file : stdout;
93 pt = vrna_ptable(db);
94 list = vrna_hx_from_ptable(pt);
95
96 fprintf(out, "%s\t%6.2f\n", seq, energy);
97 for(s = 0; list[s].length > 0; s++){
98 fprintf(out, "%d\t%d\t%d\n", list[s].start, list[s].end, list[s].length);
99 }
100
101 free(pt);
102 free(list);
103 }
104
105 PUBLIC void
vrna_file_connect(const char * seq,const char * db,float energy,const char * identifier,FILE * file)106 vrna_file_connect(const char *seq,
107 const char *db,
108 float energy,
109 const char *identifier,
110 FILE *file){
111
112 int i, power_d;
113 FILE *out = (file) ? file : stdout;
114
115 if(strlen(seq) != strlen(db)) {
116 vrna_message_warning("vrna_file_connect: "
117 "sequence and structure have unequal length (%d vs. %d)!",
118 strlen(seq),
119 strlen(db));
120 return;
121 }
122
123 short *pt = vrna_ptable(db);
124
125 for(power_d=0;pow(10,power_d) <= (int)strlen(seq);power_d++);
126
127 /*
128 Connect table file format looks like this:
129
130 300 ENERGY = 7.0 example
131 1 G 0 2 22 1
132 2 G 1 3 21 2
133
134 where the headerline is followed by 6 columns with:
135 1. Base number: index n
136 2. Base (A, C, G, T, U, X)
137 3. Index n-1 (0 if first nucleotide)
138 4. Index n+1 (0 if last nucleotide)
139 5. Number of the base to which n is paired. No pairing is indicated by 0 (zero).
140 6. Natural numbering.
141 */
142
143 /* print header */
144 fprintf(out, "%d ENERGY = %6.2f", (int)strlen(seq), energy);
145 if(identifier)
146 fprintf(out, " %s\n", identifier);
147
148 /* print structure information except for last line */
149 /* TODO: modify the structure information for cofold */
150 for(i = 0; i < strlen(seq) - 1; i++){
151 fprintf(out, "%*d %c %*d %*d %*d %*d\n",
152 power_d, i+1, /* nucleotide index */
153 (char)toupper(seq[i]), /* nucleotide char */
154 power_d, i, /* nucleotide predecessor index */
155 power_d, i+2, /* nucleotide successor index */
156 power_d, pt[i+1], /* pairing partner index */
157 power_d, i+1); /* nucleotide natural numbering */
158 }
159 /* print last line */
160 fprintf(out, "%*d %c %*d %*d %*d %*d\n",
161 power_d, i+1,
162 (char)toupper(seq[i]),
163 power_d, i,
164 power_d, 0,
165 power_d, pt[i+1],
166 power_d, i+1);
167
168 /* clean up */
169 free(pt);
170 fflush(out);
171 }
172
173 PUBLIC void
vrna_file_bpseq(const char * seq,const char * db,FILE * file)174 vrna_file_bpseq(const char *seq,
175 const char *db,
176 FILE *file){
177
178 int i;
179 FILE *out = (file) ? file : stdout;
180
181 if(strlen(seq) != strlen(db)) {
182 vrna_message_warning("vrna_file_bpseq: "
183 "sequence and structure have unequal length (%d vs. %d)!",
184 strlen(seq),
185 strlen(db));
186 return;
187 }
188
189 short *pt = vrna_ptable(db);
190
191 for(i = 1; i <= pt[0]; i++){
192 fprintf(out, "%d %c %d\n", i, (char)toupper(seq[i-1]), pt[i]);
193 }
194
195 /* clean up */
196 free(pt);
197 fflush(out);
198 }
199
200 #if VRNA_WITH_JSON_SUPPORT
201
202 PUBLIC void
vrna_file_json(const char * seq,const char * db,double energy,const char * identifier,FILE * file)203 vrna_file_json( const char *seq,
204 const char *db,
205 double energy,
206 const char *identifier,
207 FILE *file){
208
209 FILE *out = (file) ? file : stdout;
210
211 JsonNode *data = json_mkobject();
212 JsonNode *value;
213
214 if(identifier){
215 value = json_mkstring(identifier);
216 json_append_member(data, "id", value);
217 }
218
219 value = json_mkstring(seq);
220 json_append_member(data, "sequence", value);
221
222 value = json_mknumber(energy);
223 json_append_member(data, "mfe", value);
224
225 value = json_mkstring(db);
226 json_append_member(data, "structure", value);
227
228
229 fprintf(out, "%s\n", json_stringify(data, "\t"));
230
231 fflush(out);
232 }
233
234 #endif
235
236 PRIVATE unsigned int
read_multiple_input_lines(char ** string,FILE * file,unsigned int option)237 read_multiple_input_lines(char **string,
238 FILE *file,
239 unsigned int option){
240
241 char *line;
242 int i, l;
243 int state = 0;
244 int str_length = 0;
245 FILE *in = (file) ? file : stdin;
246
247 line = (inbuf2) ? inbuf2 : vrna_read_line(in);
248 inbuf2 = NULL;
249 do{
250
251 /*
252 * read lines until informative data appears or
253 * report an error if anything goes wrong
254 */
255 if(!line) return VRNA_INPUT_ERROR;
256
257 l = (int)strlen(line);
258
259 /* eliminate whitespaces at the end of the line read */
260 if(!(option & VRNA_INPUT_NO_TRUNCATION))
261 elim_trailing_ws(line);
262
263 l = (int)strlen(line);
264 str_length = (*string) ? (int) strlen(*string) : 0;
265
266 switch(*line){
267 case '@': /* user abort */
268 if(state) inbuf2 = line;
269 else free(line);
270 return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_QUIT;
271
272 case '\0': /* empty line */
273 if(option & VRNA_INPUT_NOSKIP_BLANK_LINES){
274 if(state) inbuf2 = line;
275 else free(line);
276 return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_BLANK_LINE;
277 }
278 break;
279
280 case '#': case '%': case ';': case '/': case '*': case ' ':
281 /* comments */
282 if(option & VRNA_INPUT_NOSKIP_COMMENTS){
283 if(state) inbuf2 = line;
284 else *string = line;
285 return (state == 2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_COMMENT;
286 }
287 break;
288
289 case '>': /* fasta header */
290 if(state) inbuf2 = line;
291 else *string = line;
292 return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_FASTA_HEADER;
293
294 case 'x': case 'e': case 'l': case '&': /* seems to be a constraint or line starting with second sequence for dimer calculations */
295 i = 1;
296 /* lets see if this assumption holds for the complete line */
297 while((line[i] == 'x') || (line[i] == 'e') || (line[i] == 'l')) i++;
298 /* lines solely consisting of 'x's, 'e's or 'l's will be considered as structure constraint */
299
300 if(
301 ((line[i]>64) && (line[i]<91)) /* A-Z */
302 || ((line[i]>96) && (line[i]<123)) /* a-z */
303 ){
304 if(option & VRNA_INPUT_FASTA_HEADER){
305 /* are we in structure mode? Then we remember this line for the next round */
306 if(state == 2){ inbuf2 = line; return VRNA_INPUT_CONSTRAINT;}
307 else{
308 *string = (char *)vrna_realloc(*string, sizeof(char) * (str_length + l + 1));
309 memcpy(*string + str_length,
310 line,
311 sizeof(char) * l);
312 (*string)[str_length + l] = '\0';
313 state = 1;
314 }
315 break;
316 }
317 /* otherwise return line read */
318 else{ *string = line; return VRNA_INPUT_SEQUENCE;}
319 }
320 /* mmmh? it really seems to be a constraint */
321 /* fallthrough */
322 case '<': case '.': case '|': case '(': case ')': case '[': case ']': case '{': case '}': case ',': case '+':
323 /* seems to be a structure or a constraint */
324 /* either we concatenate this line to one that we read previously */
325 if(option & VRNA_INPUT_FASTA_HEADER){
326 if(state == 1){
327 inbuf2 = line;
328 return VRNA_INPUT_SEQUENCE;
329 }
330 else{
331 *string = (char *)vrna_realloc(*string, sizeof(char) * (str_length + l + 1));
332 memcpy(*string + str_length,
333 line,
334 sizeof(char) * l);
335 (*string)[str_length + l] = '\0';
336 state = 2;
337 }
338 }
339 /* or we return it as it is */
340 else{
341 *string = line;
342 return VRNA_INPUT_CONSTRAINT;
343 }
344 break;
345 default: if(option & VRNA_INPUT_FASTA_HEADER){
346 /* are we already in sequence mode? */
347 if(state == 2){
348 inbuf2 = line;
349 return VRNA_INPUT_CONSTRAINT;
350 }
351 else{
352 *string = (char *)vrna_realloc(*string, sizeof(char) * (str_length + l + 1));
353 memcpy(*string + str_length,
354 line,
355 sizeof(char) * l);
356 (*string)[str_length + l] = '\0';
357 state = 1;
358 }
359 }
360 /* otherwise return line read */
361 else{
362 *string = line;
363 return VRNA_INPUT_SEQUENCE;
364 }
365 }
366 free(line);
367 line = vrna_read_line(in);
368 }while(line);
369
370 return (state==2) ? VRNA_INPUT_CONSTRAINT : (state==1) ? VRNA_INPUT_SEQUENCE : VRNA_INPUT_ERROR;
371 }
372
373 PUBLIC unsigned int
vrna_file_fasta_read_record(char ** header,char ** sequence,char *** rest,FILE * file,unsigned int options)374 vrna_file_fasta_read_record( char **header,
375 char **sequence,
376 char ***rest,
377 FILE *file,
378 unsigned int options){
379
380 unsigned int input_type, return_type, tmp_type;
381 int rest_count;
382 char *input_string;
383
384 rest_count = 0;
385 return_type = tmp_type = 0;
386 input_string = *header = *sequence = NULL;
387 *rest = (char **)vrna_alloc(sizeof(char *));
388
389 /* remove unnecessary option flags from options variable... */
390 options &= ~VRNA_INPUT_FASTA_HEADER;
391
392 /* read first input or last buffered input */
393 if(typebuf){
394 input_type = typebuf;
395 input_string = inbuf;
396 typebuf = 0;
397 inbuf = NULL;
398 }
399 else input_type = read_multiple_input_lines(&input_string, file, options);
400
401 if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return input_type;
402
403 /* skip everything until we read either a fasta header or a sequence */
404 while(input_type & (VRNA_INPUT_MISC | VRNA_INPUT_CONSTRAINT | VRNA_INPUT_BLANK_LINE)){
405 free(input_string); input_string = NULL;
406 input_type = read_multiple_input_lines(&input_string, file, options);
407 if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return input_type;
408 }
409
410 if(input_type & VRNA_INPUT_FASTA_HEADER){
411 return_type |= VRNA_INPUT_FASTA_HEADER; /* remember that we've read a fasta header */
412 *header = input_string;
413 input_string = NULL;
414 /* get next data-block with fasta support if not explicitely forbidden by VRNA_INPUT_NO_SPAN */
415 input_type = read_multiple_input_lines(
416 &input_string,
417 file,
418 ((options & VRNA_INPUT_NO_SPAN) ? 0 : VRNA_INPUT_FASTA_HEADER) | options
419 );
420 if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return (return_type | input_type);
421 }
422
423 if(input_type & VRNA_INPUT_SEQUENCE){
424 return_type |= VRNA_INPUT_SEQUENCE; /* remember that we've read a sequence */
425 *sequence = input_string;
426 input_string = NULL;
427 } else {
428 vrna_message_warning("vrna_file_fasta_read_record: "
429 "sequence input missing!");
430 return VRNA_INPUT_ERROR;
431 }
432
433 /* read the rest until we find user abort, EOF, new sequence or new fasta header */
434 if(!(options & VRNA_INPUT_NO_REST)){
435 options |= VRNA_INPUT_NOSKIP_COMMENTS; /* allow commetns to appear in rest output */
436 tmp_type = VRNA_INPUT_QUIT | VRNA_INPUT_ERROR | VRNA_INPUT_SEQUENCE | VRNA_INPUT_FASTA_HEADER;
437 if(options & VRNA_INPUT_NOSKIP_BLANK_LINES) tmp_type |= VRNA_INPUT_BLANK_LINE;
438 while(!((input_type = read_multiple_input_lines(&input_string, file, options)) & tmp_type)){
439 *rest = vrna_realloc(*rest, sizeof(char **)*(++rest_count + 1));
440 (*rest)[rest_count-1] = input_string;
441 input_string = NULL;
442 }
443 /*
444 if(input_type & (VRNA_INPUT_QUIT | VRNA_INPUT_ERROR)) return input_type;
445 */
446
447 /* finished reading everything...
448 * we now put the last line into the buffer if necessary
449 * since it should belong to the next record
450 */
451 inbuf = input_string;
452 typebuf = input_type;
453 }
454 (*rest)[rest_count] = NULL;
455 return (return_type);
456 }
457
458 PUBLIC char *
vrna_extract_record_rest_structure(const char ** lines,unsigned int length,unsigned int options)459 vrna_extract_record_rest_structure( const char **lines,
460 unsigned int length,
461 unsigned int options){
462
463 char *structure = NULL;
464 int r, i, l, cl, stop;
465 char *c;
466
467 if(lines){
468 for(r = i = stop = 0; lines[i]; i++){
469 l = (int)strlen(lines[i]);
470 c = (char *) vrna_alloc(sizeof(char) * (l+1));
471 (void) sscanf(lines[i], "%s", c);
472 cl = (int)strlen(c);
473
474 /* line commented out ? */
475 if((*c == '#') || (*c == '%') || (*c == ';') || (*c == '/') || (*c == '*' || (*c == '\0'))){
476 /* skip leading comments only, i.e. do not allow comments inside the constraint */
477 if(!r) continue;
478 else break;
479 }
480
481 /* append the structure part to the output */
482 r += cl+1;
483 structure = (char *)vrna_realloc(structure, r*sizeof(char));
484 strcat(structure, c);
485 free(c);
486 /* stop if the assumed structure length has been reached */
487 if((length > 0) && (r-1 == length)) break;
488 /* stop if not allowed to read from multiple lines */
489 if(!(options & VRNA_OPTION_MULTILINE)) break;
490 }
491 }
492 return structure;
493 }
494
495 PUBLIC void
vrna_extract_record_rest_constraint(char ** cstruc,const char ** lines,unsigned int option)496 vrna_extract_record_rest_constraint(char **cstruc,
497 const char **lines,
498 unsigned int option){
499
500 *cstruc = vrna_extract_record_rest_structure(lines, 0, option | (option & VRNA_CONSTRAINT_MULTILINE) ? VRNA_OPTION_MULTILINE : 0);
501
502 }
503
504 PUBLIC int
vrna_file_SHAPE_read(const char * file_name,int length,double default_value,char * sequence,double * values)505 vrna_file_SHAPE_read( const char *file_name,
506 int length,
507 double default_value,
508 char *sequence,
509 double *values){
510
511 FILE *fp;
512 char *line;
513 int i;
514 int count = 0;
515
516 if(!file_name)
517 return 0;
518
519 if(!(fp = fopen(file_name, "r"))){
520 vrna_message_warning("SHAPE data file could not be opened");
521 return 0;
522 }
523
524 for (i = 0; i < length; ++i)
525 {
526 sequence[i] = 'N';
527 values[i + 1] = default_value;
528 }
529
530 sequence[length] = '\0';
531
532 while((line=vrna_read_line(fp))){
533 int position;
534 unsigned char nucleotide = 'N';
535 double reactivity = default_value;
536 char *second_entry = 0;
537 char *third_entry = 0;
538 char *c;
539
540 if(sscanf(line, "%d", &position) != 1)
541 {
542 free(line);
543 continue;
544 }
545
546 if(position <= 0 || position > length)
547 {
548 vrna_message_warning("Provided SHAPE data outside of sequence scope");
549 fclose(fp);
550 free(line);
551 return 0;
552 }
553
554 for(c = line + 1; *c; ++c){
555 if(isspace(*(c-1)) && !isspace(*c)) {
556 if(!second_entry){
557 second_entry = c;
558 }else{
559 third_entry = c;
560 break;
561 }
562 }
563 }
564
565 if(second_entry){
566 if(third_entry){
567 sscanf(second_entry, "%c", &nucleotide);
568 sscanf(third_entry, "%lf", &reactivity);
569 }else if(sscanf(second_entry, "%lf", &reactivity) != 1)
570 sscanf(second_entry, "%c", &nucleotide);
571 }
572
573 sequence[position-1] = nucleotide;
574 values[position] = reactivity;
575 ++count;
576
577 free(line);
578 }
579
580 fclose(fp);
581
582 if(!count)
583 {
584 vrna_message_warning("SHAPE data file is empty");
585 return 0;
586 }
587
588 return 1;
589 }
590
591 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
592
593 /*###########################################*/
594 /*# deprecated functions below #*/
595 /*###########################################*/
596
597 PUBLIC unsigned int
get_multi_input_line(char ** string,unsigned int option)598 get_multi_input_line( char **string,
599 unsigned int option){
600
601 return read_multiple_input_lines(string, NULL, option);
602 }
603
604 PUBLIC unsigned int
read_record(char ** header,char ** sequence,char *** rest,unsigned int options)605 read_record(char **header,
606 char **sequence,
607 char ***rest,
608 unsigned int options){
609
610 return vrna_file_fasta_read_record(header, sequence, rest, NULL, options);
611 }
612
613 PUBLIC char *
extract_record_rest_structure(const char ** lines,unsigned int length,unsigned int options)614 extract_record_rest_structure(const char **lines,
615 unsigned int length,
616 unsigned int options){
617
618 return vrna_extract_record_rest_structure(lines, length, options);
619 }
620
621
622 #endif
623