1 /*
2 * ViennaRNA/utils/alignments.c
3 *
4 * Helper functions related to alignments
5 */
6
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <errno.h>
14 #include <time.h>
15 #include <string.h>
16 #include <ctype.h>
17 #include <math.h>
18
19 #include "ViennaRNA/utils/basic.h"
20 #include "ViennaRNA/utils/strings.h"
21 #include "ViennaRNA/fold_vars.h"
22 #include "ViennaRNA/pair_mat.h"
23 #include "ViennaRNA/model.h"
24 #include "ViennaRNA/ribo.h"
25 #include "ViennaRNA/alphabet.h"
26 #include "ViennaRNA/io/utils.h"
27 #include "ViennaRNA/utils/alignments.h"
28
29 /*
30 #################################
31 # GLOBAL VARIABLES #
32 #################################
33 */
34
35 /*
36 #################################
37 # PRIVATE VARIABLES #
38 #################################
39 */
40
41 /*
42 * IUP nucleotide classes indexed by a bit string of the present bases
43 * A C AC G AG CG ACG U AU CU ACU GU AGU CGU ACGU
44 */
45 static char IUP[17] = "-ACMGRSVUWYHKDBN";
46
47 /*
48 #################################
49 # PRIVATE FUNCTION DECLARATIONS #
50 #################################
51 */
52 PRIVATE char **
53 copy_alignment(const char **alignment,
54 unsigned int options);
55
56
57 /*
58 #################################
59 # BEGIN OF FUNCTION DEFINITIONS #
60 #################################
61 */
62 PUBLIC int
vrna_aln_mpi(const char ** alignment)63 vrna_aln_mpi(const char **alignment)
64 {
65 int i, j, k, s, n_seq, n, pairnum = 0, sumident = 0;
66 float ident = 0;
67
68 if (alignment) {
69 n = (int)strlen(alignment[0]);
70 for (s = 0; alignment[s]; s++);
71 n_seq = s;
72
73 for (j = 0; j < n_seq - 1; j++)
74 for (k = j + 1; k < n_seq; k++) {
75 ident = 0;
76 for (i = 1; i <= n; i++) {
77 if (alignment[k][i] == alignment[j][i])
78 ident++;
79
80 pairnum++;
81 }
82 sumident += ident;
83 }
84
85 if (pairnum > 0)
86 return (int)(sumident * 100 / pairnum);
87 }
88
89 return 0;
90 }
91
92
93 /*---------------------------------------------------------------------------*/
94 PRIVATE int
compare_pinfo(const void * pi1,const void * pi2)95 compare_pinfo(const void *pi1,
96 const void *pi2)
97 {
98 vrna_pinfo_t *p1, *p2;
99 int i, nc1, nc2;
100
101 p1 = (vrna_pinfo_t *)pi1;
102 p2 = (vrna_pinfo_t *)pi2;
103 for (nc1 = nc2 = 0, i = 1; i <= 6; i++) {
104 if (p1->bp[i] > 0)
105 nc1++;
106
107 if (p2->bp[i] > 0)
108 nc2++;
109 }
110 /* sort mostly by probability, add
111 * epsilon * comp_mutations/(non-compatible+1) to break ties */
112 return (p1->p + 0.01 * nc1 / (p1->bp[0] + 1.)) <
113 (p2->p + 0.01 * nc2 / (p2->bp[0] + 1.)) ? 1 : -1;
114 }
115
116
117 PUBLIC vrna_pinfo_t *
vrna_aln_pinfo(vrna_fold_compound_t * vc,const char * structure,double threshold)118 vrna_aln_pinfo(vrna_fold_compound_t *vc,
119 const char *structure,
120 double threshold)
121 {
122 int i, j, num_p = 0, max_p = 64;
123 vrna_pinfo_t *pi;
124 double *duck, p;
125 short *ptable = NULL;
126
127 short **S = vc->S;
128 char **AS = vc->sequences;
129 int n_seq = vc->n_seq;
130 int n = vc->length;
131 int *my_iindx = vc->iindx;
132 FLT_OR_DBL *probs = vc->exp_matrices->probs;
133 vrna_md_t *md = &(vc->exp_params->model_details);
134 int turn = md->min_loop_size;
135
136 max_p = 64;
137 pi = vrna_alloc(max_p * sizeof(vrna_pinfo_t));
138 duck = (double *)vrna_alloc((n + 1) * sizeof(double));
139 if (structure)
140 ptable = vrna_ptable(structure);
141
142 for (i = 1; i < n; i++)
143 for (j = i + turn + 1; j <= n; j++) {
144 if ((p = probs[my_iindx[i] - j]) >= threshold) {
145 duck[i] -= p * log(p);
146 duck[j] -= p * log(p);
147
148 int type, s;
149 pi[num_p].i = i;
150 pi[num_p].j = j;
151 pi[num_p].p = p;
152 pi[num_p].ent = duck[i] + duck[j] - p * log(p);
153
154 for (type = 0; type < 8; type++)
155 pi[num_p].bp[type] = 0;
156 for (s = 0; s < n_seq; s++) {
157 type = md->pair[S[s][i]][S[s][j]];
158 if (S[s][i] == 0 && S[s][j] == 0)
159 type = 7; /* gap-gap */
160
161 if ((AS[s][i - 1] == '-') || (AS[s][j - 1] == '-'))
162 type = 7;
163
164 if ((AS[s][i - 1] == '~') || (AS[s][j - 1] == '~'))
165 type = 7;
166
167 pi[num_p].bp[type]++;
168 }
169 if (ptable)
170 pi[num_p].comp = (ptable[i] == j) ? 1 : 0;
171
172 num_p++;
173 if (num_p >= max_p) {
174 max_p *= 2;
175 pi = vrna_realloc(pi, max_p * sizeof(vrna_pinfo_t));
176 }
177 }
178 }
179 free(duck);
180 pi = vrna_realloc(pi, (num_p + 1) * sizeof(vrna_pinfo_t));
181 pi[num_p].i = 0;
182 qsort(pi, num_p, sizeof(vrna_pinfo_t), compare_pinfo);
183
184 free(ptable);
185 return pi;
186 }
187
188
189 PUBLIC int *
vrna_aln_pscore(const char ** alignment,vrna_md_t * md)190 vrna_aln_pscore(const char **alignment,
191 vrna_md_t *md)
192 {
193 /*
194 * calculate co-variance bonus for each pair depending on
195 * compensatory/consistent mutations and incompatible seqs
196 * should be 0 for conserved pairs, >0 for good pairs
197 */
198
199 #define NONE -10000 /* score for forbidden pairs */
200
201 int i, j, k, l, s, n, n_seq, *indx, turn, max_span;
202 float **dm;
203 vrna_md_t md_default;
204 int *pscore;
205 short **S;
206
207 int olddm[7][7] = { { 0, 0, 0, 0, 0, 0, 0 }, /* hamming distance between pairs */
208 { 0, 0, 2, 2, 1, 2, 2 }, /* CG */
209 { 0, 2, 0, 1, 2, 2, 2 }, /* GC */
210 { 0, 2, 1, 0, 2, 1, 2 }, /* GU */
211 { 0, 1, 2, 2, 0, 2, 1 }, /* UG */
212 { 0, 2, 2, 1, 2, 0, 2 }, /* AU */
213 { 0, 2, 2, 2, 1, 2, 0 } /* UA */ };
214
215 pscore = NULL;
216
217 if (!md) {
218 vrna_md_set_default(&md_default);
219 md = &md_default;
220 }
221
222 if (alignment) {
223 /* length of alignment */
224 n = (int)strlen(alignment[0]);
225
226 /* count number of sequences */
227 for (s = 0; alignment[s]; s++);
228 n_seq = s;
229
230 /* make numeric encoding of sequences */
231 S = (short **)vrna_alloc(sizeof(short *) * (n_seq + 1));
232 for (s = 0; s < n_seq; s++)
233 S[s] = vrna_seq_encode_simple(alignment[s], md);
234
235 indx = vrna_idx_col_wise(n);
236
237 turn = md->min_loop_size;
238
239 pscore = (int *)vrna_alloc(sizeof(int) * ((n + 1) * (n + 2) / 2 + 2));
240
241 if (md->ribo) {
242 if (RibosumFile != NULL)
243 dm = readribosum(RibosumFile);
244 else
245 dm = get_ribosum(alignment, n_seq, n);
246 } else {
247 /*use usual matrix*/
248 dm = vrna_alloc(7 * sizeof(float *));
249 for (i = 0; i < 7; i++) {
250 dm[i] = vrna_alloc(7 * sizeof(float));
251 for (j = 0; j < 7; j++)
252 dm[i][j] = (float)olddm[i][j];
253 }
254 }
255
256 max_span = md->max_bp_span;
257 if ((max_span < turn + 2) || (max_span > n))
258 max_span = n;
259
260 for (i = 1; i < n; i++) {
261 for (j = i + 1; (j < i + turn + 1) && (j <= n); j++)
262 pscore[indx[j] + i] = NONE;
263 for (j = i + turn + 1; j <= n; j++) {
264 int pfreq[8] = {
265 0, 0, 0, 0, 0, 0, 0, 0
266 };
267 double score;
268 for (s = 0; s < n_seq; s++) {
269 int type;
270 if (S[s][i] == 0 && S[s][j] == 0) {
271 type = 7; /* gap-gap */
272 } else {
273 if ((alignment[s][i] == '~') || (alignment[s][j] == '~'))
274 type = 7;
275 else
276 type = md->pair[S[s][i]][S[s][j]];
277 }
278
279 pfreq[type]++;
280 }
281 if (pfreq[0] * 2 + pfreq[7] > n_seq) {
282 pscore[indx[j] + i] = NONE;
283 continue;
284 }
285
286 for (k = 1, score = 0; k <= 6; k++) /* ignore pairtype 7 (gap-gap) */
287 for (l = k; l <= 6; l++)
288 score += pfreq[k] * pfreq[l] * dm[k][l];
289 /* counter examples score -1, gap-gap scores -0.25 */
290 pscore[indx[j] + i] = md->cv_fact *
291 ((UNIT * score) / n_seq - md->nc_fact * UNIT *
292 (pfreq[0] + pfreq[7] * 0.25));
293
294 if ((j - i + 1) > max_span)
295 pscore[indx[j] + i] = NONE;
296 }
297 }
298
299 if (md->noLP) {
300 /* remove unwanted pairs */
301 for (k = 1; k < n - turn - 1; k++)
302 for (l = 1; l <= 2; l++) {
303 int type, ntype = 0, otype = 0;
304 i = k;
305 j = i + turn + l;
306 type = pscore[indx[j] + i];
307 while ((i >= 1) && (j <= n)) {
308 if ((i > 1) && (j < n))
309 ntype = pscore[indx[j + 1] + i - 1];
310
311 if ((otype < md->cv_fact * MINPSCORE) && (ntype < md->cv_fact * MINPSCORE)) /* too many counterexamples */
312 pscore[indx[j] + i] = NONE; /* i.j can only form isolated pairs */
313
314 otype = type;
315 type = ntype;
316 i--;
317 j++;
318 }
319 }
320 }
321
322 /*free dm */
323 for (i = 0; i < 7; i++)
324 free(dm[i]);
325 free(dm);
326
327 for (s = 0; s < n_seq; s++)
328 free(S[s]);
329 free(S);
330
331 free(indx);
332 }
333
334 return pscore;
335 }
336
337
338 PUBLIC char **
vrna_aln_slice(const char ** alignment,unsigned int i,unsigned int j)339 vrna_aln_slice(const char **alignment,
340 unsigned int i,
341 unsigned int j)
342 {
343 char **sub;
344 unsigned int n;
345 int n_seq, s;
346
347 sub = NULL;
348
349 if (alignment) {
350 /* check if [i,j] is within range */
351 n = strlen(alignment[0]);
352
353 if ((i < j) && (j <= n)) {
354 /* get number of sequences in alignment */
355 for (n_seq = 0; alignment[n_seq] != NULL; n_seq++);
356
357 sub = (char **)vrna_alloc(sizeof(char *) * (n_seq + 1));
358 for (s = 0; s < n_seq; s++)
359 sub[s] = vrna_alloc(sizeof(char) * (j - i + 2));
360 sub[s] = NULL;
361
362 /* copy subalignment */
363 for (s = 0; s < n_seq; s++) {
364 sub[s] = memcpy(sub[s], alignment[s] + i - 1, sizeof(char) * (j - i + 1));
365 sub[s][(j - i + 1)] = '\0';
366 }
367 }
368 }
369
370 return sub;
371 }
372
373
374 PUBLIC void
vrna_aln_free(char ** alignment)375 vrna_aln_free(char **alignment)
376 {
377 int n_seq;
378
379 if (alignment) {
380 for (n_seq = 0; alignment[n_seq] != NULL; n_seq++)
381 free(alignment[n_seq]);
382 free(alignment);
383 }
384 }
385
386
387 PUBLIC char **
vrna_aln_uppercase(const char ** alignment)388 vrna_aln_uppercase(const char **alignment)
389 {
390 if (alignment)
391 return copy_alignment(alignment, VRNA_ALN_UPPERCASE);
392
393 return NULL;
394 }
395
396
397 PUBLIC char **
vrna_aln_toRNA(const char ** alignment)398 vrna_aln_toRNA(const char **alignment)
399 {
400 if (alignment)
401 return copy_alignment(alignment, VRNA_ALN_RNA);
402
403 return NULL;
404 }
405
406
407 PUBLIC char **
vrna_aln_copy(const char ** alignment,unsigned int options)408 vrna_aln_copy(const char **alignment,
409 unsigned int options)
410 {
411 if (alignment)
412 return copy_alignment(alignment, options);
413
414 return NULL;
415 }
416
417
418 PUBLIC float *
vrna_aln_conservation_struct(const char ** alignment,const char * structure,const vrna_md_t * md_p)419 vrna_aln_conservation_struct(const char **alignment,
420 const char *structure,
421 const vrna_md_t *md_p)
422 {
423 unsigned int i, j, n, s, n_seq;
424 int a, b;
425 float *conservation = NULL;
426 short *pt;
427 vrna_md_t md;
428
429 if ((alignment) && (structure)) {
430 n = strlen(structure);
431 if (n > 0) {
432 /* check alignment for consistency */
433 for (s = 0; alignment[s]; s++) {
434 if (strlen(alignment[s]) != n) {
435 vrna_message_warning("vrna_aln_bpcons: Length of aligned sequence #%d does not match consensus structure length\n"
436 "%s\n\%s\n",
437 s + 1,
438 alignment[s],
439 structure);
440 return NULL;
441 }
442 }
443
444 n_seq = s;
445
446 if (md_p)
447 vrna_md_copy(&md, md_p);
448 else
449 vrna_md_set_default(&md);
450
451 pt = vrna_ptable(structure);
452 conservation = (float *)vrna_alloc(sizeof(float) * (n + 1));
453
454 for (i = 1; i < n; i++) {
455 if (i < pt[i]) {
456 j = pt[i];
457 for (s = 0; s < n_seq; s++) {
458 a = vrna_nucleotide_encode(alignment[s][i - 1], &md);
459 b = vrna_nucleotide_encode(alignment[s][j - 1], &md);
460 if (md.pair[a][b]) {
461 conservation[i] += 1.;
462 conservation[j] += 1.;
463 }
464 }
465 conservation[i] /= (float)n_seq;
466 conservation[j] /= (float)n_seq;
467 }
468 }
469
470 free(pt);
471 } else {
472 vrna_message_warning("vrna_aln_bpcons: Structure length is 0!");
473 }
474 }
475
476 return conservation;
477 }
478
479
480 PUBLIC float *
vrna_aln_conservation_col(const char ** alignment,const vrna_md_t * md_p,unsigned int options)481 vrna_aln_conservation_col(const char **alignment,
482 const vrna_md_t *md_p,
483 unsigned int options)
484 {
485 unsigned int i, n, s, n_seq;
486 float *conservation = NULL;
487 vrna_md_t md;
488
489 if (alignment) {
490 n = strlen(alignment[0]);
491 if (n > 0) {
492 /* check alignment for consistency */
493 for (s = 1; alignment[s]; s++) {
494 if (strlen(alignment[s]) != n) {
495 vrna_message_warning("vrna_aln_conservation: Length of aligned sequence #%d does not match length of first sequence\n"
496 "%s\n\n",
497 s + 1,
498 alignment[s]);
499 return NULL;
500 }
501 }
502
503 n_seq = s;
504
505 if (md_p)
506 vrna_md_copy(&md, md_p);
507 else
508 vrna_md_set_default(&md);
509
510 conservation = (float *)vrna_alloc(sizeof(float) * (n + 1));
511 for (i = 1; i <= n; i++) {
512 /*
513 * Here, we differentiate between 32 different nucleotide types.
514 * Change this if we ever cope with larger alphabets!
515 */
516 unsigned int nfreq[32] = {
517 0, 0, 0, 0, 0, 0, 0, 0,
518 0, 0, 0, 0, 0, 0, 0, 0,
519 0, 0, 0, 0, 0, 0, 0, 0,
520 0, 0, 0, 0, 0, 0, 0, 0
521 };
522
523 /* count frequencies of individual nucleotides */
524 for (s = 0; s < n_seq; s++)
525 nfreq[vrna_nucleotide_encode(alignment[s][i - 1], &md)]++;
526
527 if (options & VRNA_MEASURE_SHANNON_ENTROPY) {
528 double sum = 0;
529 for (s = 0; s < 32; s++) {
530 if (nfreq[s] > 0) {
531 double p = (double)nfreq[s] / (double)n_seq;
532 sum += p * log(p) / log(2.0);
533 }
534 }
535 conservation[i] = (float)(-sum);
536 }
537 }
538 } else {
539 vrna_message_warning("vrna_aln_conservation: Length of first sequence in alignment is 0!");
540 }
541 }
542
543 return conservation;
544 }
545
546
547 PUBLIC char *
vrna_aln_consensus_sequence(const char ** alignment,const vrna_md_t * md_p)548 vrna_aln_consensus_sequence(const char **alignment,
549 const vrna_md_t *md_p)
550 {
551 /* simple consensus sequence (most frequent character) */
552 char *consensus;
553 unsigned int i, n, s, n_seq;
554 vrna_md_t md;
555
556 consensus = NULL;
557
558 if (alignment) {
559 n = strlen(alignment[0]);
560 if (n > 0) {
561 /* check alignment for consistency */
562 for (s = 1; alignment[s]; s++) {
563 if (strlen(alignment[s]) != n) {
564 vrna_message_warning("vrna_aln_consensus_sequence: "
565 "Length of aligned sequence #%d does not match length of first sequence\n"
566 "%s\n\n",
567 s + 1,
568 alignment[s]);
569 return NULL;
570 }
571 }
572
573 n_seq = s;
574
575 if (md_p)
576 vrna_md_copy(&md, md_p);
577 else
578 vrna_md_set_default(&md);
579
580 consensus = (char *)vrna_alloc((n + 1) * sizeof(char));
581
582 for (i = 0; i < n; i++) {
583 int c, fm;
584 int freq[8] = {
585 0, 0, 0, 0, 0, 0, 0, 0
586 };
587
588 for (s = 0; s < n_seq; s++)
589 freq[vrna_nucleotide_encode(alignment[s][i], &md)]++;
590
591 for (s = c = fm = 0; s < 8; s++) /* find the most frequent char */
592 if (freq[s] > fm) {
593 c = s;
594 fm = freq[c];
595 }
596
597 if (s > 4)
598 s++; /* skip T */
599
600 consensus[i] = vrna_nucleotide_decode(c, &md);
601 }
602 }
603 }
604
605 return consensus;
606 }
607
608
609 PUBLIC char *
vrna_aln_consensus_mis(const char ** alignment,const vrna_md_t * md_p)610 vrna_aln_consensus_mis(const char **alignment,
611 const vrna_md_t *md_p)
612 {
613 /*
614 * MIS displays the 'most informative sequence' (Freyhult et al 2004),
615 * elements in columns with frequency greater than the background
616 * frequency are projected into iupac notation. Columns where gaps are
617 * over-represented are in lower case.
618 */
619 char *mis;
620 unsigned char c;
621 unsigned int i, n, s, n_seq;
622 unsigned int bgfreq[8] = {
623 0, 0, 0, 0, 0, 0, 0, 0
624 };
625 vrna_md_t md;
626
627 mis = NULL;
628
629 if (alignment) {
630 n = strlen(alignment[0]);
631 if (n > 0) {
632 /* check alignment for consistency */
633 for (s = 1; alignment[s]; s++) {
634 if (strlen(alignment[s]) != n) {
635 vrna_message_warning("vrna_aln_consensus_mis: "
636 "Length of aligned sequence #%d does not match length of first sequence\n"
637 "%s\n\n",
638 s + 1,
639 alignment[s]);
640 return NULL;
641 }
642 }
643
644 n_seq = s;
645
646 if (md_p)
647 vrna_md_copy(&md, md_p);
648 else
649 vrna_md_set_default(&md);
650
651 mis = (char *)vrna_alloc((n + 1) * sizeof(char));
652
653 /* determien backgroud frequencies */
654 for (i = 0; i < n; i++)
655 for (s = 0; s < n_seq; s++) {
656 c = (unsigned char)vrna_nucleotide_encode(alignment[s][i], &md);
657 if (c > 4)
658 c = 5;
659
660 bgfreq[c]++;
661 }
662
663 /* generate MIS */
664 for (i = 0; i < n; i++) {
665 int code = 0;
666 unsigned int freq[8] = {
667 0, 0, 0, 0, 0, 0, 0, 0
668 };
669
670 for (s = 0; s < n_seq; s++) {
671 c = (unsigned char)vrna_nucleotide_encode(alignment[s][i], &md);
672 if (c > 4)
673 c = 5;
674
675 freq[c]++;
676 }
677 for (c = 4; c > 0; c--) {
678 code <<= 1;
679 if (freq[c] * n >= bgfreq[c])
680 code++;
681 }
682 mis[i] = IUP[code];
683 if (freq[0] * n > bgfreq[0])
684 mis[i] = tolower(IUP[code]);
685 }
686 }
687 }
688
689 return mis;
690 }
691
692
693 /*
694 #####################################
695 # BEGIN OF STATIC HELPER FUNCTIONS #
696 #####################################
697 */
698 PRIVATE char **
copy_alignment(const char ** alignment,unsigned int options)699 copy_alignment(const char **alignment,
700 unsigned int options)
701 {
702 char **output = NULL;
703 unsigned int s;
704
705 /* determine number of sequence in the alignment */
706 for (s = 0; alignment[s]; s++);
707
708 /* allocate memory for output alignment */
709 output = (char **)vrna_alloc(sizeof(char *) * (s + 1));
710
711 /* copy alignment and convert to uppercase */
712 for (s = 0; alignment[s]; s++) {
713 output[s] = strdup(alignment[s]);
714
715 if (options & VRNA_ALN_UPPERCASE)
716 /* convert to uppercase letters */
717 vrna_seq_toupper(output[s]);
718
719 if (options & VRNA_ALN_RNA)
720 /* convert DNA to RNA alphabet */
721 vrna_seq_toRNA(output[s]);
722 }
723
724 /* mark end of alignment */
725 output[s] = NULL;
726
727 return output;
728 }
729
730
731 /*
732 *###########################################
733 *# deprecated functions below #
734 *###########################################
735 */
736
737 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
738
739 #define MAX_NUM_NAMES 500
740
741 PRIVATE void
742 encode_ali_sequence_old(const char *sequence,
743 short *S,
744 short *s5,
745 short *s3,
746 char *ss,
747 unsigned short *as,
748 int circular);
749
750
751 int
read_clustal(FILE * clust,char * AlignedSeqs[],char * names[])752 read_clustal(FILE *clust,
753 char *AlignedSeqs[],
754 char *names[])
755 {
756 char *line, name[100] = "", *seq;
757 int n, nn = 0, num_seq = 0, i;
758
759 if ((line = vrna_read_line(clust)) == NULL) {
760 vrna_message_warning("Empty CLUSTAL file");
761 return 0;
762 }
763
764 if ((strncmp(line, "CLUSTAL", 7) != 0) && (!strstr(line, "STOCKHOLM"))) {
765 vrna_message_warning("This doesn't look like a CLUSTAL/STOCKHOLM file, sorry");
766 free(line);
767 return 0;
768 }
769
770 free(line);
771 line = vrna_read_line(clust);
772
773 while (line != NULL) {
774 if (strncmp(line, "//", 2) == 0) {
775 free(line);
776 break;
777 }
778
779 if (((n = strlen(line)) < 4) || isspace((int)line[0])) {
780 /* skip non-sequence line */
781 free(line);
782 line = vrna_read_line(clust);
783 nn = 0; /* reset seqence number */
784 continue;
785 }
786
787 /* skip comments */
788 if (line[0] == '#') {
789 free(line);
790 line = vrna_read_line(clust);
791 continue;
792 }
793
794 seq = (char *)vrna_alloc((n + 1) * sizeof(char));
795 sscanf(line, "%99s %s", name, seq);
796
797 for (i = 0; i < strlen(seq); i++) {
798 if (seq[i] == '.')
799 seq[i] = '-'; /* replace '.' gaps by '-' */
800
801 /* comment the next line and think about something more difficult to deal with
802 * lowercase sequence letters if you really want to */
803 seq[i] = toupper(seq[i]);
804 }
805
806 if (nn == num_seq) {
807 /* first time */
808 names[nn] = strdup(name);
809 AlignedSeqs[nn] = strdup(seq);
810 } else {
811 if (strcmp(name, names[nn]) != 0) {
812 /* name doesn't match */
813 vrna_message_warning("Sorry, your file is messed up (inconsitent seq-names)");
814 free(line);
815 free(seq);
816 return 0;
817 }
818
819 AlignedSeqs[nn] = (char *)
820 vrna_realloc(AlignedSeqs[nn], strlen(seq) + strlen(AlignedSeqs[nn]) + 1);
821 strcat(AlignedSeqs[nn], seq);
822 }
823
824 nn++;
825 if (nn > num_seq)
826 num_seq = nn;
827
828 free(seq);
829 free(line);
830 if (num_seq >= MAX_NUM_NAMES) {
831 vrna_message_warning("Too many sequences in CLUSTAL/STOCKHOLM file");
832 return 0;
833 }
834
835 line = vrna_read_line(clust);
836 }
837
838 AlignedSeqs[num_seq] = NULL;
839 names[num_seq] = NULL;
840 if (num_seq == 0) {
841 vrna_message_warning("No sequences found in CLUSTAL/STOCKHOLM file");
842 return 0;
843 }
844
845 n = strlen(AlignedSeqs[0]);
846 for (nn = 1; nn < num_seq; nn++) {
847 if (strlen(AlignedSeqs[nn]) != n) {
848 vrna_message_warning("Sorry, your file is messed up.\n"
849 "Unequal lengths!");
850 return 0;
851 }
852 }
853
854 vrna_message_info(stderr, "%d sequences; length of alignment %d.", nn, n);
855 return num_seq;
856 }
857
858
859 char *
consensus(const char * AS[])860 consensus(const char *AS[])
861 {
862 /* simple consensus sequence (most frequent character) */
863 char *string;
864 int i, n;
865
866 string = NULL;
867
868 if (AS) {
869 n = strlen(AS[0]);
870 string = (char *)vrna_alloc((n + 1) * sizeof(char));
871 for (i = 0; i < n; i++) {
872 int s, c, fm, freq[8] = {
873 0, 0, 0, 0, 0, 0, 0, 0
874 };
875 for (s = 0; AS[s] != NULL; s++)
876 freq[encode_char(AS[s][i])]++;
877 for (s = c = fm = 0; s < 8; s++) /* find the most frequent char */
878 if (freq[s] > fm)
879 c = s, fm = freq[c];
880
881 if (s > 4)
882 s++; /* skip T */
883
884 string[i] = Law_and_Order[c];
885 }
886 }
887
888 return string;
889 }
890
891
892 char *
consens_mis(const char * AS[])893 consens_mis(const char *AS[])
894 {
895 /* MIS displays the 'most informative sequence' (Freyhult et al 2004),
896 * elements in columns with frequency greater than the background
897 * frequency are projected into iupac notation. Columns where gaps are
898 * over-represented are in lower case. */
899
900 char *cons;
901 int i, s, n, N, c;
902 int bgfreq[8] = {
903 0, 0, 0, 0, 0, 0, 0, 0
904 };
905
906 cons = NULL;
907
908 if (AS) {
909 n = strlen(AS[0]);
910 for (N = 0; AS[N] != NULL; N++);
911 cons = (char *)vrna_alloc((n + 1) * sizeof(char));
912
913 for (i = 0; i < n; i++)
914 for (s = 0; s < N; s++) {
915 c = encode_char(AS[s][i]);
916 if (c > 4)
917 c = 5;
918
919 bgfreq[c]++;
920 }
921
922 for (i = 0; i < n; i++) {
923 int freq[8] = {
924 0, 0, 0, 0, 0, 0, 0, 0
925 };
926 int code = 0;
927 for (s = 0; s < N; s++) {
928 c = encode_char(AS[s][i]);
929 if (c > 4)
930 c = 5;
931
932 freq[c]++;
933 }
934 for (c = 4; c > 0; c--) {
935 code <<= 1;
936 if (freq[c] * n >= bgfreq[c])
937 code++;
938 }
939 cons[i] = IUP[code];
940 if (freq[0] * n > bgfreq[0])
941 cons[i] = tolower(IUP[code]);
942 }
943 }
944
945 return cons;
946 }
947
948
949 PUBLIC char *
get_ungapped_sequence(const char * seq)950 get_ungapped_sequence(const char *seq)
951 {
952 char *tmp_sequence, *b;
953 int i;
954
955 tmp_sequence = strdup(seq);
956
957 b = tmp_sequence;
958 i = 0;
959 do {
960 if ((*b == '-') || (*b == '_') || (*b == '~') || (*b == '.'))
961 continue;
962
963 tmp_sequence[i] = *b;
964 i++;
965 } while (*(++b));
966
967 tmp_sequence = (char *)vrna_realloc(tmp_sequence, (i + 1) * sizeof(char));
968 tmp_sequence[i] = '\0';
969
970 return tmp_sequence;
971 }
972
973
974 PUBLIC int
get_mpi(char * Alseq[],int n_seq,int length,int * mini)975 get_mpi(char *Alseq[],
976 int n_seq,
977 int length,
978 int *mini)
979 {
980 int i, j, k, pairnum = 0, sumident = 0;
981 float ident = 0, minimum = 1.;
982
983 for (j = 0; j < n_seq - 1; j++)
984 for (k = j + 1; k < n_seq; k++) {
985 ident = 0;
986 for (i = 1; i <= length; i++) {
987 if (Alseq[k][i] == Alseq[j][i])
988 ident++;
989
990 pairnum++;
991 }
992 if ((ident / length) < minimum)
993 minimum = ident / (float)length;
994
995 sumident += ident;
996 }
997 mini[0] = (int)(minimum * 100.);
998 if (pairnum > 0)
999 return (int)(sumident * 100 / pairnum);
1000 else
1001 return 0;
1002 }
1003
1004
1005 PUBLIC void
alloc_sequence_arrays(const char ** sequences,short *** S,short *** S5,short *** S3,unsigned short *** a2s,char *** Ss,int circ)1006 alloc_sequence_arrays(const char **sequences,
1007 short ***S,
1008 short ***S5,
1009 short ***S3,
1010 unsigned short ***a2s,
1011 char ***Ss,
1012 int circ)
1013 {
1014 unsigned int s, n_seq, length;
1015
1016 if (sequences[0] != NULL) {
1017 length = strlen(sequences[0]);
1018 for (s = 0; sequences[s] != NULL; s++);
1019 n_seq = s;
1020 *S = (short **)vrna_alloc((n_seq + 1) * sizeof(short *));
1021 *S5 = (short **)vrna_alloc((n_seq + 1) * sizeof(short *));
1022 *S3 = (short **)vrna_alloc((n_seq + 1) * sizeof(short *));
1023 *a2s = (unsigned short **)vrna_alloc((n_seq + 1) * sizeof(unsigned short *));
1024 *Ss = (char **)vrna_alloc((n_seq + 1) * sizeof(char *));
1025 for (s = 0; s < n_seq; s++) {
1026 if (strlen(sequences[s]) != length)
1027 vrna_message_error("uneqal seqence lengths");
1028
1029 (*S5)[s] = (short *)vrna_alloc((length + 2) * sizeof(short));
1030 (*S3)[s] = (short *)vrna_alloc((length + 2) * sizeof(short));
1031 (*a2s)[s] = (unsigned short *)vrna_alloc((length + 2) * sizeof(unsigned short));
1032 (*Ss)[s] = (char *)vrna_alloc((length + 2) * sizeof(char));
1033 (*S)[s] = (short *)vrna_alloc((length + 2) * sizeof(short));
1034 encode_ali_sequence_old(sequences[s], (*S)[s], (*S5)[s], (*S3)[s], (*Ss)[s], (*a2s)[s], circ);
1035 }
1036 (*S5)[n_seq] = NULL;
1037 (*S3)[n_seq] = NULL;
1038 (*a2s)[n_seq] = NULL;
1039 (*Ss)[n_seq] = NULL;
1040 (*S)[n_seq] = NULL;
1041 } else {
1042 vrna_message_error("alloc_sequence_arrays: no sequences in the alignment!");
1043 }
1044 }
1045
1046
1047 PUBLIC void
free_sequence_arrays(unsigned int n_seq,short *** S,short *** S5,short *** S3,unsigned short *** a2s,char *** Ss)1048 free_sequence_arrays(unsigned int n_seq,
1049 short ***S,
1050 short ***S5,
1051 short ***S3,
1052 unsigned short ***a2s,
1053 char ***Ss)
1054 {
1055 unsigned int s;
1056
1057 for (s = 0; s < n_seq; s++) {
1058 free((*S)[s]);
1059 free((*S5)[s]);
1060 free((*S3)[s]);
1061 free((*a2s)[s]);
1062 free((*Ss)[s]);
1063 }
1064 free(*S);
1065 *S = NULL;
1066 free(*S5);
1067 *S5 = NULL;
1068 free(*S3);
1069 *S3 = NULL;
1070 free(*a2s);
1071 *a2s = NULL;
1072 free(*Ss);
1073 *Ss = NULL;
1074 }
1075
1076
1077 PUBLIC void
encode_ali_sequence(const char * sequence,short * S,short * s5,short * s3,char * ss,unsigned short * as,int circular)1078 encode_ali_sequence(const char *sequence,
1079 short *S,
1080 short *s5,
1081 short *s3,
1082 char *ss,
1083 unsigned short *as,
1084 int circular)
1085 {
1086 if ((sequence) &&
1087 (S) &&
1088 (s5) &&
1089 (s3) &&
1090 (ss) &&
1091 (as))
1092 encode_ali_sequence_old(sequence, S, s5, s3, ss, as, circular);
1093 }
1094
1095
1096 PRIVATE void
encode_ali_sequence_old(const char * sequence,short * S,short * s5,short * s3,char * ss,unsigned short * as,int circular)1097 encode_ali_sequence_old(const char *sequence,
1098 short *S,
1099 short *s5,
1100 short *s3,
1101 char *ss,
1102 unsigned short *as,
1103 int circular)
1104 {
1105 unsigned int i, l;
1106 unsigned short p;
1107
1108 l = strlen(sequence);
1109 S[0] = (short)l;
1110 s5[0] = s5[1] = 0;
1111
1112 /* make numerical encoding of sequence */
1113 for (i = 1; i <= l; i++) {
1114 short ctemp;
1115 ctemp = (short)encode_char(toupper(sequence[i - 1]));
1116 S[i] = ctemp;
1117 }
1118
1119 if (oldAliEn) {
1120 /* use alignment sequences in all energy evaluations */
1121 ss[0] = sequence[0];
1122 for (i = 1; i < l; i++) {
1123 s5[i] = S[i - 1];
1124 s3[i] = S[i + 1];
1125 ss[i] = sequence[i];
1126 as[i] = i;
1127 }
1128 ss[l] = sequence[l];
1129 as[l] = l;
1130 s5[l] = S[l - 1];
1131 s3[l] = 0;
1132 S[l + 1] = S[1];
1133 s5[1] = 0;
1134 if (circular) {
1135 s5[1] = S[l];
1136 s3[l] = S[1];
1137 ss[l + 1] = S[1];
1138 }
1139 } else {
1140 if (circular) {
1141 for (i = l; i > 0; i--) {
1142 char c5;
1143 c5 = sequence[i - 1];
1144 if ((c5 == '-') || (c5 == '_') || (c5 == '~') || (c5 == '.'))
1145 continue;
1146
1147 s5[1] = S[i];
1148 break;
1149 }
1150 for (i = 1; i <= l; i++) {
1151 char c3;
1152 c3 = sequence[i - 1];
1153 if ((c3 == '-') || (c3 == '_') || (c3 == '~') || (c3 == '.'))
1154 continue;
1155
1156 s3[l] = S[i];
1157 break;
1158 }
1159 } else {
1160 s5[1] = s3[l] = 0;
1161 }
1162
1163 for (i = 1, p = 0; i <= l; i++) {
1164 char c5;
1165 c5 = sequence[i - 1];
1166 if ((c5 == '-') || (c5 == '_') || (c5 == '~') || (c5 == '.')) {
1167 s5[i + 1] = s5[i];
1168 } else {
1169 /* no gap */
1170 ss[p++] = sequence[i - 1]; /*start at 0!!*/
1171 s5[i + 1] = S[i];
1172 }
1173
1174 as[i] = p;
1175 }
1176 for (i = l; i >= 1; i--) {
1177 char c3;
1178 c3 = sequence[i - 1];
1179 if ((c3 == '-') || (c3 == '_') || (c3 == '~') || (c3 == '.'))
1180 s3[i - 1] = s3[i];
1181 else
1182 s3[i - 1] = S[i];
1183 }
1184 }
1185 }
1186
1187
1188 #endif
1189