1 /*
2 * sequence.c
3 *
4 * Code for handling nucleotide sequences
5 *
6 * Part of the ViennaRNA Package
7 * (c) 2016 Ronny Lorenz
8 */
9
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <string.h>
13
14 #include "ViennaRNA/datastructures/basic.h"
15 #include "ViennaRNA/utils/basic.h"
16 #include "ViennaRNA/utils/strings.h"
17 #include "ViennaRNA/alphabet.h"
18 #include "ViennaRNA/sequence.h"
19
20 /*
21 #################################
22 # PRIVATE FUNCTION DECLARATIONS #
23 #################################
24 */
25 PRIVATE void
26 set_sequence(vrna_seq_t *obj,
27 const char *string,
28 const char *name,
29 vrna_md_t *md,
30 unsigned int options);
31
32
33 PRIVATE void
34 free_sequence_data(vrna_seq_t *obj);
35
36
37 /*
38 #################################
39 # BEGIN OF FUNCTION DEFINITIONS #
40 #################################
41 */
42 PUBLIC vrna_seq_t *
vrna_sequence(const char * string,unsigned int options)43 vrna_sequence(const char *string,
44 unsigned int options)
45 {
46 vrna_seq_t *data = NULL;
47
48 if (string) {
49 data = (vrna_seq_t *)vrna_alloc(sizeof(vrna_seq_t));
50 set_sequence(data, string, NULL, NULL, options);
51 }
52
53 return data;
54 }
55
56
57 PUBLIC int
vrna_sequence_add(vrna_fold_compound_t * vc,const char * string,unsigned int options)58 vrna_sequence_add(vrna_fold_compound_t *vc,
59 const char *string,
60 unsigned int options)
61 {
62 unsigned int add_length;
63 int ret = 0;
64
65 if ((vc) && (vc->type == VRNA_FC_TYPE_SINGLE) && (string)) {
66 add_length = strlen(string);
67
68 /* add the sequence to the nucleotides container */
69 vc->nucleotides = (vrna_seq_t *)vrna_realloc(vc->nucleotides,
70 sizeof(vrna_seq_t) *
71 (vc->strands + 1));
72 set_sequence(&(vc->nucleotides[vc->strands]),
73 string,
74 NULL,
75 &(vc->params->model_details),
76 options);
77
78 /* increase strands counter */
79 vc->strands++;
80
81 /* add new sequence to initial order of all strands */
82 vc->sequence = (char *)vrna_realloc(vc->sequence,
83 sizeof(char) *
84 (vc->length + add_length + 1));
85 memcpy(vc->sequence + vc->length,
86 vc->nucleotides[vc->strands - 1].string,
87 add_length * sizeof(char));
88 vc->sequence[vc->length + add_length] = '\0';
89
90 /* add encoding for new strand */
91 vc->sequence_encoding = (short *)vrna_realloc(vc->sequence_encoding,
92 sizeof(short) *
93 (vc->length + add_length + 2));
94
95 memcpy(vc->sequence_encoding + vc->length + 1,
96 vc->nucleotides[vc->strands - 1].encoding + 1,
97 add_length * sizeof(short));
98
99 /* restore circular encoding */
100 vc->sequence_encoding[vc->length + add_length + 1] = vc->sequence_encoding[1];
101 vc->sequence_encoding[0] =
102 vc->sequence_encoding[vc->length + add_length];
103
104 /* add encoding2 (simple encoding) for new strand */
105 vc->sequence_encoding2 = (short *)vrna_realloc(vc->sequence_encoding2,
106 sizeof(short) *
107 (vc->length + add_length + 2));
108 short *enc = vrna_seq_encode_simple(vc->nucleotides[vc->strands - 1].string,
109 &(vc->params->model_details));
110 memcpy(vc->sequence_encoding2 + vc->length + 1,
111 enc + 1,
112 add_length * sizeof(short));
113 free(enc);
114 vc->sequence_encoding2[vc->length + add_length + 1] = vc->sequence_encoding2[1];
115 vc->sequence_encoding2[0] = (short)(vc->length + add_length);
116
117 /* finally, increase length property of the fold compound */
118 vc->length = vc->length + add_length;
119
120 ret = 1;
121 }
122
123 return ret;
124 }
125
126
127 PUBLIC int
vrna_msa_add(vrna_fold_compound_t * fc,const char ** alignment,const char ** names,const unsigned char * orientation,const unsigned long long * start,const unsigned long long * genome_size,unsigned int options)128 vrna_msa_add(vrna_fold_compound_t *fc,
129 const char **alignment,
130 const char **names,
131 const unsigned char *orientation,
132 const unsigned long long *start,
133 const unsigned long long *genome_size,
134 unsigned int options)
135 {
136 int ret;
137
138 ret = 0;
139
140 if ((fc) &&
141 (fc->type == VRNA_FC_TYPE_COMPARATIVE) &&
142 (alignment)) {
143 size_t s, ss, cnt, num_names, num_orientations, num_starts, num_genome_sizes;
144 vrna_msa_t *msa;
145
146 num_names = num_orientations = num_starts = num_genome_sizes = 0;
147
148 /* add the sequence to the nucleotides container */
149 fc->alignment = (vrna_msa_t *)vrna_realloc(fc->alignment,
150 sizeof(vrna_msa_t) *
151 (fc->strands + 1));
152
153 /* count number of sequences in alignment */
154 for (s = 0; alignment[s]; s++);
155
156 msa = &(fc->alignment[fc->strands]);
157
158 msa->n_seq = s;
159 msa->sequences = (vrna_seq_t *)vrna_alloc(sizeof(vrna_seq_t) * s);
160 msa->orientation = NULL;
161 msa->start = NULL;
162 msa->genome_size = NULL;
163 msa->a2s = NULL; /* alignment column to nt number mapping */
164 msa->gapfree_seq = NULL; /* gap-free sequence */
165 msa->gapfree_size = NULL; /* gap-free sequence length */
166
167 if (names) {
168 for (s = 0; s < msa->n_seq; s++) {
169 if (!names[s])
170 break;
171
172 num_names++;
173 }
174
175 if (num_names != msa->n_seq) {
176 vrna_message_warning("vrna_msa_add(): "
177 "Too few names provided for sequences in MSA input! "
178 "Expected %u but received %u ",
179 msa->n_seq,
180 num_names);
181 }
182 }
183
184 for (s = 0; alignment[s]; s++) {
185 set_sequence(&(msa->sequences[s]),
186 alignment[s],
187 (s < num_names) ? names[s] : NULL,
188 &(fc->params->model_details),
189 options);
190 }
191
192 if (orientation) {
193 /* check whether the all orientations are provided */
194 for (s = 0; s < msa->n_seq; s++) {
195 if (!orientation[s])
196 break;
197
198 num_orientations++;
199 }
200
201 if (s != msa->n_seq) {
202 vrna_message_warning("vrna_msa_add(): "
203 "Too few orientations provided for sequences in MSA input! "
204 "Expected %u but received %u ",
205 msa->n_seq,
206 num_orientations);
207 }
208
209 msa->orientation = (unsigned char *)vrna_alloc(sizeof(unsigned char) * msa->n_seq);
210
211 memcpy(msa->orientation, orientation, sizeof(unsigned char) * num_orientations);
212 }
213
214 if (start) {
215 /* check whether the all orientations are provided */
216 for (s = 0; s < msa->n_seq; s++) {
217 if (!start[s])
218 break;
219
220 num_starts++;
221 }
222
223 if (s != msa->n_seq) {
224 vrna_message_warning("vrna_msa_add(): "
225 "Too few start positions provided for sequences in MSA input! "
226 "Expected %u but received %u ",
227 msa->n_seq,
228 num_starts);
229 }
230
231 msa->start = (unsigned long long *)vrna_alloc(sizeof(unsigned long long) * msa->n_seq);
232
233 memcpy(msa->start, start, sizeof(unsigned long long) * num_starts);
234 }
235
236 if (genome_size) {
237 /* check whether the all orientations are provided */
238 for (s = 0; s < msa->n_seq; s++) {
239 if (!genome_size[s])
240 break;
241
242 num_genome_sizes++;
243 }
244
245 if (s != msa->n_seq) {
246 vrna_message_warning("vrna_msa_add(): "
247 "Too few genome sizes provided for sequences in MSA input! "
248 "Expected %u but received %u ",
249 msa->n_seq,
250 num_genome_sizes);
251 }
252
253 msa->genome_size = (unsigned long long *)vrna_alloc(sizeof(unsigned long long) * msa->n_seq);
254
255 memcpy(msa->genome_size, genome_size, sizeof(unsigned long long) * num_genome_sizes);
256 }
257
258 /* now for the gap-free sequence properties */
259 msa->gapfree_seq = (char **)vrna_alloc(sizeof(char *) * msa->n_seq);
260 msa->gapfree_size = (unsigned int *)vrna_alloc(sizeof(unsigned int) * msa->n_seq);
261 msa->a2s = (unsigned int **)vrna_alloc(sizeof(unsigned int *) * msa->n_seq);
262
263 for (s = 0; s < msa->n_seq; s++) {
264 msa->gapfree_seq[s] = vrna_seq_ungapped(msa->sequences[s].string);
265 msa->gapfree_size[s] = strlen(msa->gapfree_seq[s]);
266 msa->a2s[s] =
267 (unsigned int *)vrna_alloc(sizeof(unsigned int) * (msa->sequences[s].length + 1));
268
269 for (ss = 1, cnt = 0; ss <= msa->sequences[s].length; ss++) {
270 if (msa->sequences[s].encoding[ss])
271 cnt++;
272
273 msa->a2s[s][ss] = cnt;
274 }
275 }
276
277 /* increase strands counter */
278 fc->strands++;
279 }
280
281 return ret;
282 }
283
284
285 PUBLIC int
vrna_sequence_remove(vrna_fold_compound_t * vc,unsigned int i)286 vrna_sequence_remove(vrna_fold_compound_t *vc,
287 unsigned int i)
288 {
289 unsigned int size;
290 int ret = 0;
291
292 if (vc) {
293 if (i < vc->strands) {
294 free_sequence_data(&(vc->nucleotides[i]));
295 /* roll all nucleotide sequences behind the deleted one to the front */
296 size = vc->strands - i - 1;
297 if (size > 0)
298 memmove(vc->nucleotides + i, vc->nucleotides + i + 1, sizeof(vrna_seq_t) * size);
299
300 vc->strands--;
301 vc->nucleotides =
302 (vrna_seq_t *)vrna_realloc(vc->nucleotides, sizeof(vrna_seq_t) * vc->strands);
303
304 ret = 1;
305 }
306 }
307
308 return ret;
309 }
310
311
312 PUBLIC void
vrna_sequence_remove_all(vrna_fold_compound_t * fc)313 vrna_sequence_remove_all(vrna_fold_compound_t *fc)
314 {
315 unsigned int i, s;
316
317 if (fc) {
318 if (fc->type == VRNA_FC_TYPE_SINGLE) {
319 for (i = 0; i < fc->strands; i++)
320 free_sequence_data(&(fc->nucleotides[i]));
321
322 free(fc->nucleotides);
323 fc->nucleotides = NULL;
324 } else {
325 for (i = 0; i < fc->strands; i++) {
326 for (s = 0; s < fc->alignment[i].n_seq; s++) {
327 free_sequence_data(&(fc->alignment[i].sequences[s]));
328 free(fc->alignment[i].gapfree_seq[s]);
329 free(fc->alignment[i].a2s[s]);
330 }
331
332 free(fc->alignment[i].sequences);
333 free(fc->alignment[i].gapfree_seq);
334 free(fc->alignment[i].a2s);
335 free(fc->alignment[i].gapfree_size);
336 free(fc->alignment[i].genome_size);
337 free(fc->alignment[i].start);
338 free(fc->alignment[i].orientation);
339 }
340 free(fc->alignment);
341 fc->alignment = NULL;
342 /* free memory occupied by temporary hack in vrna_sequence_prepare() */
343 free_sequence_data(&(fc->nucleotides[0]));
344 }
345
346 free(fc->strand_number);
347 free(fc->strand_order);
348 free(fc->strand_start);
349 free(fc->strand_end);
350
351 fc->strands = 0;
352 fc->strand_number = NULL;
353 fc->strand_order = NULL;
354 fc->strand_start = NULL;
355 fc->strand_end = NULL;
356 }
357 }
358
359
360 PUBLIC void
vrna_sequence_prepare(vrna_fold_compound_t * fc)361 vrna_sequence_prepare(vrna_fold_compound_t *fc)
362 {
363 unsigned int cnt, i;
364
365 if (fc) {
366 free(fc->strand_number);
367 free(fc->strand_order);
368 free(fc->strand_start);
369 free(fc->strand_end);
370
371 fc->strand_order = NULL;
372 fc->strand_start = NULL;
373 fc->strand_end = NULL;
374
375 fc->strand_number = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->length + 2));
376
377 switch (fc->type) {
378 case VRNA_FC_TYPE_SINGLE:
379 /* 1. store initial strand order */
380 fc->strand_order = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->strands + 1));
381 for (cnt = 0; cnt < fc->strands; cnt++)
382 fc->strand_order[cnt] = cnt;
383
384 /* 2. mark start and end positions of sequences */
385 fc->strand_start = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->strands + 1));
386 fc->strand_end = (unsigned int *)vrna_alloc(sizeof(unsigned int) * (fc->strands + 1));
387
388 fc->strand_start[0] = 1;
389 fc->strand_end[0] = fc->strand_start[0] + fc->nucleotides[0].length - 1;
390
391 for (cnt = 1; cnt < fc->strands; cnt++) {
392 fc->strand_start[cnt] = fc->strand_end[cnt - 1] + 1;
393 fc->strand_end[cnt] = fc->strand_start[cnt] + fc->nucleotides[cnt].length - 1;
394 for (i = fc->strand_start[cnt]; i <= fc->strand_end[cnt]; i++)
395 fc->strand_number[i] = cnt;
396 }
397 /* this sets pos. n + 1 as well */
398 fc->strand_number[fc->length + 1] = fc->strands - 1;
399
400 break;
401
402 case VRNA_FC_TYPE_COMPARATIVE:
403 /*
404 * for now, comparative structure prediction does not allow for RNA-RNA interactions,
405 * so we pretend working on a single strand
406 */
407 fc->nucleotides = (vrna_seq_t *)vrna_realloc(fc->nucleotides,
408 sizeof(vrna_seq_t) * (fc->strands + 1));
409 fc->nucleotides[0].string = NULL;
410 fc->nucleotides[0].type = VRNA_SEQ_RNA;
411 fc->nucleotides[0].length = fc->length;
412
413 /* 1. store initial strand order */
414 fc->strand_order = (unsigned int *)vrna_alloc(sizeof(unsigned int) * 2);
415
416 /* 2. mark start and end positions of sequences */
417 fc->strand_start = (unsigned int *)vrna_alloc(sizeof(unsigned int) * 2);
418 fc->strand_end = (unsigned int *)vrna_alloc(sizeof(unsigned int) * 2);
419 fc->strand_start[0] = 1;
420 fc->strand_end[0] = fc->strand_start[0] + fc->length - 1;
421
422 break;
423 }
424 }
425 }
426
427
428 PUBLIC int
vrna_sequence_order_update(vrna_fold_compound_t * fc,const unsigned int * order)429 vrna_sequence_order_update(vrna_fold_compound_t *fc,
430 const unsigned int *order)
431 {
432 if ((fc) && (order)) {
433 /* first assign new order to strand_oder array */
434 memcpy(fc->strand_order, order, sizeof(unsigned int) * fc->strands);
435
436 /* now, update strand_start/end positions and strand_number association */
437 fc->strand_start[order[0]] = 1;
438 fc->strand_end[order[0]] = fc->strand_start[order[0]] + fc->nucleotides[order[0]].length - 1;
439
440 for (size_t j = fc->strand_start[order[0]]; j <= fc->strand_end[order[0]]; j++)
441 fc->strand_number[j] = order[0];
442
443 for (size_t i = 1; i < fc->strands; i++) {
444 fc->strand_start[order[i]] = fc->strand_end[order[i - 1]] + 1;
445 fc->strand_end[order[i]] = fc->strand_start[order[i]] + fc->nucleotides[order[i]].length -
446 1;
447
448 for (size_t j = fc->strand_start[order[i]]; j <= fc->strand_end[order[i]]; j++)
449 fc->strand_number[j] = order[i];
450 }
451
452 /* also set pos. n + 1 for convenience reasons */
453 fc->strand_number[fc->length + 1] = order[fc->strands - 1];
454
455 /* update the global concatenated sequence string */
456 for (size_t i = 0; i < fc->strands; i++)
457 memcpy(fc->sequence + fc->strand_start[order[i]] - 1,
458 fc->nucleotides[order[i]].string,
459 sizeof(char) * fc->nucleotides[order[i]].length);
460
461 /* finally, update global sequence encoding(s) for new order */
462 for (size_t i = 0; i < fc->strands; i++)
463 memcpy(fc->sequence_encoding + fc->strand_start[order[i]],
464 fc->nucleotides[order[i]].encoding + 1,
465 sizeof(short) * fc->nucleotides[order[i]].length);
466
467 fc->sequence_encoding[0] = fc->sequence_encoding[fc->length];
468 fc->sequence_encoding[fc->length + 1] = fc->sequence_encoding[1];
469
470 for (size_t i = 0; i < fc->strands; i++) {
471 short *enc = vrna_seq_encode_simple(fc->nucleotides[order[i]].string,
472 &(fc->params->model_details));
473 memcpy(fc->sequence_encoding2 + fc->strand_start[order[i]],
474 enc + 1,
475 sizeof(short) * fc->nucleotides[order[i]].length);
476 free(enc);
477 }
478
479 fc->sequence_encoding2[0] = (short)fc->length;
480 fc->sequence_encoding2[fc->length + 1] = fc->sequence_encoding2[1];
481
482 return 1;
483 }
484
485 return 0;
486 }
487
488
489 PRIVATE void
set_sequence(vrna_seq_t * obj,const char * string,const char * name,vrna_md_t * md,unsigned int options)490 set_sequence(vrna_seq_t *obj,
491 const char *string,
492 const char *name,
493 vrna_md_t *md,
494 unsigned int options)
495 {
496 obj->name = name ? strdup(name) : NULL;
497 obj->string = strdup(string);
498 vrna_seq_toupper(obj->string);
499 obj->length = strlen(obj->string);
500
501 switch (options) {
502 default:
503 obj->type = VRNA_SEQ_RNA;
504 }
505
506 obj->encoding = vrna_seq_encode(obj->string, md);
507 obj->encoding5 = (short *)vrna_alloc(sizeof(short) * (obj->length + 1));
508 obj->encoding3 = (short *)vrna_alloc(sizeof(short) * (obj->length + 1));
509
510 if (md->circ) {
511 for (size_t i = obj->length; i > 0; i--) {
512 if (obj->encoding[i] == 0) /* no nucleotide, i.e. gap */
513 continue;
514
515 obj->encoding5[1] = obj->encoding[i];
516 break;
517 }
518 for (size_t i = 1; i <= obj->length; i++) {
519 if (obj->encoding[i] == 0) /* no nucleotide, i.e. gap */
520 continue;
521
522 obj->encoding3[obj->length] = obj->encoding[i];
523 break;
524 }
525 } else {
526 obj->encoding5[1] = obj->encoding3[obj->length] = 0;
527 }
528
529 for (size_t i = 1, p = 0; i < obj->length; i++) {
530 if (obj->encoding[i] == 0)
531 obj->encoding5[i + 1] = obj->encoding5[i];
532 else
533 obj->encoding5[i + 1] = obj->encoding[i];
534 }
535
536 for (size_t i = obj->length; i > 1; i--) {
537 if (obj->encoding[i] == 0)
538 obj->encoding3[i - 1] = obj->encoding3[i];
539 else
540 obj->encoding3[i - 1] = obj->encoding[i];
541 }
542 }
543
544
545 PRIVATE void
free_sequence_data(vrna_seq_t * obj)546 free_sequence_data(vrna_seq_t *obj)
547 {
548 free(obj->string);
549 free(obj->name);
550 free(obj->encoding);
551 free(obj->encoding5);
552 free(obj->encoding3);
553 obj->string = NULL;
554 obj->name = NULL;
555 obj->encoding = NULL;
556 obj->encoding5 = NULL;
557 obj->encoding3 = NULL;
558 obj->type = VRNA_SEQ_UNKNOWN;
559 obj->length = 0;
560 }
561