1 /*
2  *  sequence.c
3  *
4  *  Code for handling nucleotide sequences
5  *
6  *  Part of the ViennaRNA Package
7  *  (c) 2016 Ronny Lorenz
8  */
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13 
14 #include "ViennaRNA/datastructures/basic.h"
15 #include "ViennaRNA/utils/basic.h"
16 #include "ViennaRNA/utils/strings.h"
17 #include "ViennaRNA/alphabet.h"
18 #include "ViennaRNA/sequence.h"
19 
20 /*
21  #################################
22  # PRIVATE FUNCTION DECLARATIONS #
23  #################################
24  */
25 PRIVATE void
26 set_sequence(vrna_seq_t   *obj,
27              const char   *string,
28              const char   *name,
29              vrna_md_t    *md,
30              unsigned int options);
31 
32 
33 PRIVATE void
34 free_sequence_data(vrna_seq_t *obj);
35 
36 
37 /*
38  #################################
39  # BEGIN OF FUNCTION DEFINITIONS #
40  #################################
41  */
42 PUBLIC vrna_seq_t *
vrna_sequence(const char * string,unsigned int options)43 vrna_sequence(const char    *string,
44               unsigned int  options)
45 {
46   vrna_seq_t *data = NULL;
47 
48   if (string) {
49     data = (vrna_seq_t *)vrna_alloc(sizeof(vrna_seq_t));
50     set_sequence(data, string, NULL, NULL, options);
51   }
52 
53   return data;
54 }
55 
56 
57 PUBLIC int
vrna_sequence_add(vrna_fold_compound_t * vc,const char * string,unsigned int options)58 vrna_sequence_add(vrna_fold_compound_t  *vc,
59                   const char            *string,
60                   unsigned int          options)
61 {
62   unsigned int  add_length;
63   int           ret = 0;
64 
65   if ((vc) && (vc->type == VRNA_FC_TYPE_SINGLE) && (string)) {
66     add_length = strlen(string);
67 
68     /* add the sequence to the nucleotides container */
69     vc->nucleotides = (vrna_seq_t *)vrna_realloc(vc->nucleotides,
70                                                  sizeof(vrna_seq_t) *
71                                                  (vc->strands + 1));
72     set_sequence(&(vc->nucleotides[vc->strands]),
73                  string,
74                  NULL,
75                  &(vc->params->model_details),
76                  options);
77 
78     /* increase strands counter */
79     vc->strands++;
80 
81     /* add new sequence to initial order of all strands */
82     vc->sequence = (char *)vrna_realloc(vc->sequence,
83                                         sizeof(char) *
84                                         (vc->length + add_length + 1));
85     memcpy(vc->sequence + vc->length,
86            vc->nucleotides[vc->strands - 1].string,
87            add_length * sizeof(char));
88     vc->sequence[vc->length + add_length] = '\0';
89 
90     /* add encoding for new strand */
91     vc->sequence_encoding = (short *)vrna_realloc(vc->sequence_encoding,
92                                                   sizeof(short) *
93                                                   (vc->length + add_length + 2));
94 
95     memcpy(vc->sequence_encoding + vc->length + 1,
96            vc->nucleotides[vc->strands - 1].encoding + 1,
97            add_length * sizeof(short));
98 
99     /* restore circular encoding */
100     vc->sequence_encoding[vc->length + add_length + 1]  = vc->sequence_encoding[1];
101     vc->sequence_encoding[0]                            =
102       vc->sequence_encoding[vc->length + add_length];
103 
104     /* add encoding2 (simple encoding) for new strand */
105     vc->sequence_encoding2 = (short *)vrna_realloc(vc->sequence_encoding2,
106                                                    sizeof(short) *
107                                                    (vc->length + add_length + 2));
108     short *enc = vrna_seq_encode_simple(vc->nucleotides[vc->strands - 1].string,
109                                         &(vc->params->model_details));
110     memcpy(vc->sequence_encoding2 + vc->length + 1,
111            enc + 1,
112            add_length * sizeof(short));
113     free(enc);
114     vc->sequence_encoding2[vc->length + add_length + 1] = vc->sequence_encoding2[1];
115     vc->sequence_encoding2[0]                           = (short)(vc->length + add_length);
116 
117     /* finally, increase length property of the fold compound */
118     vc->length = vc->length + add_length;
119 
120     ret = 1;
121   }
122 
123   return ret;
124 }
125 
126 
127 PUBLIC int
vrna_msa_add(vrna_fold_compound_t * fc,const char ** alignment,const char ** names,const unsigned char * orientation,const unsigned long long * start,const unsigned long long * genome_size,unsigned int options)128 vrna_msa_add(vrna_fold_compound_t     *fc,
129              const char               **alignment,
130              const char               **names,
131              const unsigned char      *orientation,
132              const unsigned long long *start,
133              const unsigned long long *genome_size,
134              unsigned int             options)
135 {
136   int ret;
137 
138   ret = 0;
139 
140   if ((fc) &&
141       (fc->type == VRNA_FC_TYPE_COMPARATIVE) &&
142       (alignment)) {
143     size_t      s, ss, cnt, num_names, num_orientations, num_starts, num_genome_sizes;
144     vrna_msa_t  *msa;
145 
146     num_names = num_orientations = num_starts = num_genome_sizes = 0;
147 
148     /* add the sequence to the nucleotides container */
149     fc->alignment = (vrna_msa_t *)vrna_realloc(fc->alignment,
150                                                sizeof(vrna_msa_t) *
151                                                (fc->strands + 1));
152 
153     /* count number of sequences in alignment */
154     for (s = 0; alignment[s]; s++);
155 
156     msa = &(fc->alignment[fc->strands]);
157 
158     msa->n_seq        = s;
159     msa->sequences    = (vrna_seq_t *)vrna_alloc(sizeof(vrna_seq_t) * s);
160     msa->orientation  = NULL;
161     msa->start        = NULL;
162     msa->genome_size  = NULL;
163     msa->a2s          = NULL; /* alignment column to nt number mapping */
164     msa->gapfree_seq  = NULL; /* gap-free sequence */
165     msa->gapfree_size = NULL; /* gap-free sequence length */
166 
167     if (names) {
168       for (s = 0; s < msa->n_seq; s++) {
169         if (!names[s])
170           break;
171 
172         num_names++;
173       }
174 
175       if (num_names != msa->n_seq) {
176         vrna_message_warning("vrna_msa_add(): "
177                              "Too few names provided for sequences in MSA input! "
178                              "Expected %u but received %u ",
179                              msa->n_seq,
180                              num_names);
181       }
182     }
183 
184     for (s = 0; alignment[s]; s++) {
185       set_sequence(&(msa->sequences[s]),
186                    alignment[s],
187                    (s < num_names) ? names[s] : NULL,
188                    &(fc->params->model_details),
189                    options);
190     }
191 
192     if (orientation) {
193       /* check whether the all orientations are provided */
194       for (s = 0; s < msa->n_seq; s++) {
195         if (!orientation[s])
196           break;
197 
198         num_orientations++;
199       }
200 
201       if (s != msa->n_seq) {
202         vrna_message_warning("vrna_msa_add(): "
203                              "Too few orientations provided for sequences in MSA input! "
204                              "Expected %u but received %u ",
205                              msa->n_seq,
206                              num_orientations);
207       }
208 
209       msa->orientation = (unsigned char *)vrna_alloc(sizeof(unsigned char) * msa->n_seq);
210 
211       memcpy(msa->orientation, orientation, sizeof(unsigned char) * num_orientations);
212     }
213 
214     if (start) {
215       /* check whether the all orientations are provided */
216       for (s = 0; s < msa->n_seq; s++) {
217         if (!start[s])
218           break;
219 
220         num_starts++;
221       }
222 
223       if (s != msa->n_seq) {
224         vrna_message_warning("vrna_msa_add(): "
225                              "Too few start positions provided for sequences in MSA input! "
226                              "Expected %u but received %u ",
227                              msa->n_seq,
228                              num_starts);
229       }
230 
231       msa->start = (unsigned long long *)vrna_alloc(sizeof(unsigned long long) * msa->n_seq);
232 
233       memcpy(msa->start, start, sizeof(unsigned long long) * num_starts);
234     }
235 
236     if (genome_size) {
237       /* check whether the all orientations are provided */
238       for (s = 0; s < msa->n_seq; s++) {
239         if (!genome_size[s])
240           break;
241 
242         num_genome_sizes++;
243       }
244 
245       if (s != msa->n_seq) {
246         vrna_message_warning("vrna_msa_add(): "
247                              "Too few genome sizes provided for sequences in MSA input! "
248                              "Expected %u but received %u ",
249                              msa->n_seq,
250                              num_genome_sizes);
251       }
252 
253       msa->genome_size = (unsigned long long *)vrna_alloc(sizeof(unsigned long long) * msa->n_seq);
254 
255       memcpy(msa->genome_size, genome_size, sizeof(unsigned long long) * num_genome_sizes);
256     }
257 
258     /* now for the gap-free sequence properties */
259     msa->gapfree_seq  = (char **)vrna_alloc(sizeof(char *) * msa->n_seq);
260     msa->gapfree_size = (unsigned int *)vrna_alloc(sizeof(unsigned int) * msa->n_seq);
261     msa->a2s          = (unsigned int **)vrna_alloc(sizeof(unsigned int *) * msa->n_seq);
262 
263     for (s = 0; s < msa->n_seq; s++) {
264       msa->gapfree_seq[s]   = vrna_seq_ungapped(msa->sequences[s].string);
265       msa->gapfree_size[s]  = strlen(msa->gapfree_seq[s]);
266       msa->a2s[s]           =
267         (unsigned int *)vrna_alloc(sizeof(unsigned int) * (msa->sequences[s].length + 1));
268 
269       for (ss = 1, cnt = 0; ss <= msa->sequences[s].length; ss++) {
270         if (msa->sequences[s].encoding[ss])
271           cnt++;
272 
273         msa->a2s[s][ss] = cnt;
274       }
275     }
276 
277     /* increase strands counter */
278     fc->strands++;
279   }
280 
281   return ret;
282 }
283 
284 
285 PUBLIC int
vrna_sequence_remove(vrna_fold_compound_t * vc,unsigned int i)286 vrna_sequence_remove(vrna_fold_compound_t *vc,
287                      unsigned int         i)
288 {
289   unsigned int  size;
290   int           ret = 0;
291 
292   if (vc) {
293     if (i < vc->strands) {
294       free_sequence_data(&(vc->nucleotides[i]));
295       /* roll all nucleotide sequences behind the deleted one to the front */
296       size = vc->strands - i - 1;
297       if (size > 0)
298         memmove(vc->nucleotides + i, vc->nucleotides + i + 1, sizeof(vrna_seq_t) * size);
299 
300       vc->strands--;
301       vc->nucleotides =
302         (vrna_seq_t *)vrna_realloc(vc->nucleotides, sizeof(vrna_seq_t) * vc->strands);
303 
304       ret = 1;
305     }
306   }
307 
308   return ret;
309 }
310 
311 
312 PUBLIC void
vrna_sequence_remove_all(vrna_fold_compound_t * fc)313 vrna_sequence_remove_all(vrna_fold_compound_t *fc)
314 {
315   unsigned int i, s;
316 
317   if (fc) {
318     if (fc->type == VRNA_FC_TYPE_SINGLE) {
319       for (i = 0; i < fc->strands; i++)
320         free_sequence_data(&(fc->nucleotides[i]));
321 
322       free(fc->nucleotides);
323       fc->nucleotides = NULL;
324     } else {
325       for (i = 0; i < fc->strands; i++) {
326         for (s = 0; s < fc->alignment[i].n_seq; s++) {
327           free_sequence_data(&(fc->alignment[i].sequences[s]));
328           free(fc->alignment[i].gapfree_seq[s]);
329           free(fc->alignment[i].a2s[s]);
330         }
331 
332         free(fc->alignment[i].sequences);
333         free(fc->alignment[i].gapfree_seq);
334         free(fc->alignment[i].a2s);
335         free(fc->alignment[i].gapfree_size);
336         free(fc->alignment[i].genome_size);
337         free(fc->alignment[i].start);
338         free(fc->alignment[i].orientation);
339       }
340       free(fc->alignment);
341       fc->alignment = NULL;
342       /* free memory occupied by temporary hack in vrna_sequence_prepare() */
343       free_sequence_data(&(fc->nucleotides[0]));
344     }
345 
346     free(fc->strand_number);
347     free(fc->strand_order);
348     free(fc->strand_start);
349     free(fc->strand_end);
350 
351     fc->strands       = 0;
352     fc->strand_number = NULL;
353     fc->strand_order  = NULL;
354     fc->strand_start  = NULL;
355     fc->strand_end    = NULL;
356   }
357 }
358 
359 
360 PUBLIC void
vrna_sequence_prepare(vrna_fold_compound_t * fc)361 vrna_sequence_prepare(vrna_fold_compound_t *fc)
362 {
363   unsigned int cnt, i;
364 
365   if (fc) {
366     free(fc->strand_number);
367     free(fc->strand_order);
368     free(fc->strand_start);
369     free(fc->strand_end);
370 
371     fc->strand_order  = NULL;
372     fc->strand_start  = NULL;
373     fc->strand_end    = NULL;
374 
375     fc->strand_number = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->length + 2));
376 
377     switch (fc->type) {
378       case VRNA_FC_TYPE_SINGLE:
379         /* 1. store initial strand order */
380         fc->strand_order = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->strands + 1));
381         for (cnt = 0; cnt < fc->strands; cnt++)
382           fc->strand_order[cnt] = cnt;
383 
384         /* 2. mark start and end positions of sequences */
385         fc->strand_start  = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->strands + 1));
386         fc->strand_end    = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->strands + 1));
387 
388         fc->strand_start[0] = 1;
389         fc->strand_end[0]   = fc->strand_start[0] + fc->nucleotides[0].length - 1;
390 
391         for (cnt = 1; cnt < fc->strands; cnt++) {
392           fc->strand_start[cnt] = fc->strand_end[cnt - 1] + 1;
393           fc->strand_end[cnt]   = fc->strand_start[cnt] + fc->nucleotides[cnt].length - 1;
394           for (i = fc->strand_start[cnt]; i <= fc->strand_end[cnt]; i++)
395             fc->strand_number[i] = cnt;
396         }
397         /* this sets pos. n + 1 as well */
398         fc->strand_number[fc->length + 1] = fc->strands - 1;
399 
400         break;
401 
402       case VRNA_FC_TYPE_COMPARATIVE:
403         /*
404          *  for now, comparative structure prediction does not allow for RNA-RNA interactions,
405          *  so we pretend working on a single strand
406          */
407         fc->nucleotides = (vrna_seq_t *)vrna_realloc(fc->nucleotides,
408                                                      sizeof(vrna_seq_t) * (fc->strands + 1));
409         fc->nucleotides[0].string = NULL;
410         fc->nucleotides[0].type   = VRNA_SEQ_RNA;
411         fc->nucleotides[0].length = fc->length;
412 
413         /* 1. store initial strand order */
414         fc->strand_order = (unsigned int *)vrna_alloc(sizeof(unsigned int) * 2);
415 
416         /* 2. mark start and end positions of sequences */
417         fc->strand_start    = (unsigned int *)vrna_alloc(sizeof(unsigned int) * 2);
418         fc->strand_end      = (unsigned int *)vrna_alloc(sizeof(unsigned int) * 2);
419         fc->strand_start[0] = 1;
420         fc->strand_end[0]   = fc->strand_start[0] + fc->length - 1;
421 
422         break;
423     }
424   }
425 }
426 
427 
428 PUBLIC int
vrna_sequence_order_update(vrna_fold_compound_t * fc,const unsigned int * order)429 vrna_sequence_order_update(vrna_fold_compound_t *fc,
430                            const unsigned int   *order)
431 {
432   if ((fc) && (order)) {
433     /* first assign new order to strand_oder array */
434     memcpy(fc->strand_order, order, sizeof(unsigned int) * fc->strands);
435 
436     /* now, update strand_start/end positions and strand_number association */
437     fc->strand_start[order[0]]  = 1;
438     fc->strand_end[order[0]]    = fc->strand_start[order[0]] + fc->nucleotides[order[0]].length - 1;
439 
440     for (size_t j = fc->strand_start[order[0]]; j <= fc->strand_end[order[0]]; j++)
441       fc->strand_number[j] = order[0];
442 
443     for (size_t i = 1; i < fc->strands; i++) {
444       fc->strand_start[order[i]]  = fc->strand_end[order[i - 1]] + 1;
445       fc->strand_end[order[i]]    = fc->strand_start[order[i]] + fc->nucleotides[order[i]].length -
446                                     1;
447 
448       for (size_t j = fc->strand_start[order[i]]; j <= fc->strand_end[order[i]]; j++)
449         fc->strand_number[j] = order[i];
450     }
451 
452     /* also set pos. n + 1 for convenience reasons */
453     fc->strand_number[fc->length + 1] = order[fc->strands - 1];
454 
455     /* update the global concatenated sequence string */
456     for (size_t i = 0; i < fc->strands; i++)
457       memcpy(fc->sequence + fc->strand_start[order[i]] - 1,
458              fc->nucleotides[order[i]].string,
459              sizeof(char) * fc->nucleotides[order[i]].length);
460 
461     /* finally, update global sequence encoding(s) for new order */
462     for (size_t i = 0; i < fc->strands; i++)
463       memcpy(fc->sequence_encoding + fc->strand_start[order[i]],
464              fc->nucleotides[order[i]].encoding + 1,
465              sizeof(short) * fc->nucleotides[order[i]].length);
466 
467     fc->sequence_encoding[0]              = fc->sequence_encoding[fc->length];
468     fc->sequence_encoding[fc->length + 1] = fc->sequence_encoding[1];
469 
470     for (size_t i = 0; i < fc->strands; i++) {
471       short *enc = vrna_seq_encode_simple(fc->nucleotides[order[i]].string,
472                                           &(fc->params->model_details));
473       memcpy(fc->sequence_encoding2 + fc->strand_start[order[i]],
474              enc + 1,
475              sizeof(short) * fc->nucleotides[order[i]].length);
476       free(enc);
477     }
478 
479     fc->sequence_encoding2[0]               = (short)fc->length;
480     fc->sequence_encoding2[fc->length + 1]  = fc->sequence_encoding2[1];
481 
482     return 1;
483   }
484 
485   return 0;
486 }
487 
488 
489 PRIVATE void
set_sequence(vrna_seq_t * obj,const char * string,const char * name,vrna_md_t * md,unsigned int options)490 set_sequence(vrna_seq_t   *obj,
491              const char   *string,
492              const char   *name,
493              vrna_md_t    *md,
494              unsigned int options)
495 {
496   obj->name   = name ? strdup(name) : NULL;
497   obj->string = strdup(string);
498   vrna_seq_toupper(obj->string);
499   obj->length = strlen(obj->string);
500 
501   switch (options) {
502     default:
503       obj->type = VRNA_SEQ_RNA;
504   }
505 
506   obj->encoding   = vrna_seq_encode(obj->string, md);
507   obj->encoding5  = (short *)vrna_alloc(sizeof(short) * (obj->length + 1));
508   obj->encoding3  = (short *)vrna_alloc(sizeof(short) * (obj->length + 1));
509 
510   if (md->circ) {
511     for (size_t i = obj->length; i > 0; i--) {
512       if (obj->encoding[i] == 0) /* no nucleotide, i.e. gap */
513         continue;
514 
515       obj->encoding5[1] = obj->encoding[i];
516       break;
517     }
518     for (size_t i = 1; i <= obj->length; i++) {
519       if (obj->encoding[i] == 0) /* no nucleotide, i.e. gap */
520         continue;
521 
522       obj->encoding3[obj->length] = obj->encoding[i];
523       break;
524     }
525   } else {
526     obj->encoding5[1] = obj->encoding3[obj->length] = 0;
527   }
528 
529   for (size_t i = 1, p = 0; i < obj->length; i++) {
530     if (obj->encoding[i] == 0)
531       obj->encoding5[i + 1] = obj->encoding5[i];
532     else
533       obj->encoding5[i + 1] = obj->encoding[i];
534   }
535 
536   for (size_t i = obj->length; i > 1; i--) {
537     if (obj->encoding[i] == 0)
538       obj->encoding3[i - 1] = obj->encoding3[i];
539     else
540       obj->encoding3[i - 1] = obj->encoding[i];
541   }
542 }
543 
544 
545 PRIVATE void
free_sequence_data(vrna_seq_t * obj)546 free_sequence_data(vrna_seq_t *obj)
547 {
548   free(obj->string);
549   free(obj->name);
550   free(obj->encoding);
551   free(obj->encoding5);
552   free(obj->encoding3);
553   obj->string     = NULL;
554   obj->name       = NULL;
555   obj->encoding   = NULL;
556   obj->encoding5  = NULL;
557   obj->encoding3  = NULL;
558   obj->type       = VRNA_SEQ_UNKNOWN;
559   obj->length     = 0;
560 }
561