1 /*
2  *    ViennaRNA/utils/alignments.c
3  *
4  *    Helper functions related to alignments
5  */
6 
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10 
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <errno.h>
14 #include <time.h>
15 #include <string.h>
16 #include <ctype.h>
17 #include <math.h>
18 
19 #include "ViennaRNA/utils/basic.h"
20 #include "ViennaRNA/utils/strings.h"
21 #include "ViennaRNA/fold_vars.h"
22 #include "ViennaRNA/pair_mat.h"
23 #include "ViennaRNA/model.h"
24 #include "ViennaRNA/ribo.h"
25 #include "ViennaRNA/alphabet.h"
26 #include "ViennaRNA/io/utils.h"
27 #include "ViennaRNA/utils/alignments.h"
28 
29 /*
30  #################################
31  # GLOBAL VARIABLES              #
32  #################################
33  */
34 
35 /*
36  #################################
37  # PRIVATE VARIABLES             #
38  #################################
39  */
40 
41 /*
42  * IUP nucleotide classes indexed by a bit string of the present bases
43  * A C AC G AG CG ACG U AU CU ACU GU AGU CGU ACGU
44  */
45 static char IUP[17] = "-ACMGRSVUWYHKDBN";
46 
47 /*
48  #################################
49  # PRIVATE FUNCTION DECLARATIONS #
50  #################################
51  */
52 PRIVATE char **
53 copy_alignment(const char   **alignment,
54                unsigned int options);
55 
56 
57 /*
58  #################################
59  # BEGIN OF FUNCTION DEFINITIONS #
60  #################################
61  */
62 PUBLIC int
vrna_aln_mpi(const char ** alignment)63 vrna_aln_mpi(const char **alignment)
64 {
65   int   i, j, k, s, n_seq, n, pairnum = 0, sumident = 0;
66   float ident = 0;
67 
68   if (alignment) {
69     n = (int)strlen(alignment[0]);
70     for (s = 0; alignment[s]; s++);
71     n_seq = s;
72 
73     for (j = 0; j < n_seq - 1; j++)
74       for (k = j + 1; k < n_seq; k++) {
75         ident = 0;
76         for (i = 1; i <= n; i++) {
77           if (alignment[k][i] == alignment[j][i])
78             ident++;
79 
80           pairnum++;
81         }
82         sumident += ident;
83       }
84 
85     if (pairnum > 0)
86       return (int)(sumident * 100 / pairnum);
87   }
88 
89   return 0;
90 }
91 
92 
93 /*---------------------------------------------------------------------------*/
94 PRIVATE int
compare_pinfo(const void * pi1,const void * pi2)95 compare_pinfo(const void  *pi1,
96               const void  *pi2)
97 {
98   vrna_pinfo_t  *p1, *p2;
99   int           i, nc1, nc2;
100 
101   p1  = (vrna_pinfo_t *)pi1;
102   p2  = (vrna_pinfo_t *)pi2;
103   for (nc1 = nc2 = 0, i = 1; i <= 6; i++) {
104     if (p1->bp[i] > 0)
105       nc1++;
106 
107     if (p2->bp[i] > 0)
108       nc2++;
109   }
110   /* sort mostly by probability, add
111    * epsilon * comp_mutations/(non-compatible+1) to break ties */
112   return (p1->p + 0.01 * nc1 / (p1->bp[0] + 1.)) <
113          (p2->p + 0.01 * nc2 / (p2->bp[0] + 1.)) ? 1 : -1;
114 }
115 
116 
117 PUBLIC vrna_pinfo_t *
vrna_aln_pinfo(vrna_fold_compound_t * vc,const char * structure,double threshold)118 vrna_aln_pinfo(vrna_fold_compound_t *vc,
119                const char           *structure,
120                double               threshold)
121 {
122   int           i, j, num_p = 0, max_p = 64;
123   vrna_pinfo_t  *pi;
124   double        *duck, p;
125   short         *ptable = NULL;
126 
127   short         **S       = vc->S;
128   char          **AS      = vc->sequences;
129   int           n_seq     = vc->n_seq;
130   int           n         = vc->length;
131   int           *my_iindx = vc->iindx;
132   FLT_OR_DBL    *probs    = vc->exp_matrices->probs;
133   vrna_md_t     *md       = &(vc->exp_params->model_details);
134   int           turn      = md->min_loop_size;
135 
136   max_p = 64;
137   pi    = vrna_alloc(max_p * sizeof(vrna_pinfo_t));
138   duck  = (double *)vrna_alloc((n + 1) * sizeof(double));
139   if (structure)
140     ptable = vrna_ptable(structure);
141 
142   for (i = 1; i < n; i++)
143     for (j = i + turn + 1; j <= n; j++) {
144       if ((p = probs[my_iindx[i] - j]) >= threshold) {
145         duck[i] -= p * log(p);
146         duck[j] -= p * log(p);
147 
148         int type, s;
149         pi[num_p].i   = i;
150         pi[num_p].j   = j;
151         pi[num_p].p   = p;
152         pi[num_p].ent = duck[i] + duck[j] - p * log(p);
153 
154         for (type = 0; type < 8; type++)
155           pi[num_p].bp[type] = 0;
156         for (s = 0; s < n_seq; s++) {
157           type = md->pair[S[s][i]][S[s][j]];
158           if (S[s][i] == 0 && S[s][j] == 0)
159             type = 7;                            /* gap-gap  */
160 
161           if ((AS[s][i - 1] == '-') || (AS[s][j - 1] == '-'))
162             type = 7;
163 
164           if ((AS[s][i - 1] == '~') || (AS[s][j - 1] == '~'))
165             type = 7;
166 
167           pi[num_p].bp[type]++;
168         }
169         if (ptable)
170           pi[num_p].comp = (ptable[i] == j) ? 1 : 0;
171 
172         num_p++;
173         if (num_p >= max_p) {
174           max_p *= 2;
175           pi    = vrna_realloc(pi, max_p * sizeof(vrna_pinfo_t));
176         }
177       }
178     }
179   free(duck);
180   pi          = vrna_realloc(pi, (num_p + 1) * sizeof(vrna_pinfo_t));
181   pi[num_p].i = 0;
182   qsort(pi, num_p, sizeof(vrna_pinfo_t), compare_pinfo);
183 
184   free(ptable);
185   return pi;
186 }
187 
188 
189 PUBLIC int *
vrna_aln_pscore(const char ** alignment,vrna_md_t * md)190 vrna_aln_pscore(const char  **alignment,
191                 vrna_md_t   *md)
192 {
193   /*
194    * calculate co-variance bonus for each pair depending on
195    * compensatory/consistent mutations and incompatible seqs
196    * should be 0 for conserved pairs, >0 for good pairs
197    */
198 
199 #define NONE -10000 /* score for forbidden pairs */
200 
201   int       i, j, k, l, s, n, n_seq, *indx, turn, max_span;
202   float     **dm;
203   vrna_md_t md_default;
204   int       *pscore;
205   short     **S;
206 
207   int       olddm[7][7] = { { 0, 0, 0, 0, 0, 0, 0 },  /* hamming distance between pairs */
208                             { 0, 0, 2, 2, 1, 2, 2 },  /* CG */
209                             { 0, 2, 0, 1, 2, 2, 2 },  /* GC */
210                             { 0, 2, 1, 0, 2, 1, 2 },  /* GU */
211                             { 0, 1, 2, 2, 0, 2, 1 },  /* UG */
212                             { 0, 2, 2, 1, 2, 0, 2 },  /* AU */
213                             { 0, 2, 2, 2, 1, 2, 0 } /* UA */ };
214 
215   pscore = NULL;
216 
217   if (!md) {
218     vrna_md_set_default(&md_default);
219     md = &md_default;
220   }
221 
222   if (alignment) {
223     /* length of alignment */
224     n = (int)strlen(alignment[0]);
225 
226     /* count number of sequences */
227     for (s = 0; alignment[s]; s++);
228     n_seq = s;
229 
230     /* make numeric encoding of sequences */
231     S = (short **)vrna_alloc(sizeof(short *) * (n_seq + 1));
232     for (s = 0; s < n_seq; s++)
233       S[s] = vrna_seq_encode_simple(alignment[s], md);
234 
235     indx = vrna_idx_col_wise(n);
236 
237     turn = md->min_loop_size;
238 
239     pscore = (int *)vrna_alloc(sizeof(int) * ((n + 1) * (n + 2) / 2 + 2));
240 
241     if (md->ribo) {
242       if (RibosumFile != NULL)
243         dm = readribosum(RibosumFile);
244       else
245         dm = get_ribosum(alignment, n_seq, n);
246     } else {
247       /*use usual matrix*/
248       dm = vrna_alloc(7 * sizeof(float *));
249       for (i = 0; i < 7; i++) {
250         dm[i] = vrna_alloc(7 * sizeof(float));
251         for (j = 0; j < 7; j++)
252           dm[i][j] = (float)olddm[i][j];
253       }
254     }
255 
256     max_span = md->max_bp_span;
257     if ((max_span < turn + 2) || (max_span > n))
258       max_span = n;
259 
260     for (i = 1; i < n; i++) {
261       for (j = i + 1; (j < i + turn + 1) && (j <= n); j++)
262         pscore[indx[j] + i] = NONE;
263       for (j = i + turn + 1; j <= n; j++) {
264         int     pfreq[8] = {
265           0, 0, 0, 0, 0, 0, 0, 0
266         };
267         double  score;
268         for (s = 0; s < n_seq; s++) {
269           int type;
270           if (S[s][i] == 0 && S[s][j] == 0) {
271             type = 7;                             /* gap-gap  */
272           } else {
273             if ((alignment[s][i] == '~') || (alignment[s][j] == '~'))
274               type = 7;
275             else
276               type = md->pair[S[s][i]][S[s][j]];
277           }
278 
279           pfreq[type]++;
280         }
281         if (pfreq[0] * 2 + pfreq[7] > n_seq) {
282           pscore[indx[j] + i] = NONE;
283           continue;
284         }
285 
286         for (k = 1, score = 0; k <= 6; k++) /* ignore pairtype 7 (gap-gap) */
287           for (l = k; l <= 6; l++)
288             score += pfreq[k] * pfreq[l] * dm[k][l];
289         /* counter examples score -1, gap-gap scores -0.25   */
290         pscore[indx[j] + i] = md->cv_fact *
291                               ((UNIT * score) / n_seq - md->nc_fact * UNIT *
292                                (pfreq[0] + pfreq[7] * 0.25));
293 
294         if ((j - i + 1) > max_span)
295           pscore[indx[j] + i] = NONE;
296       }
297     }
298 
299     if (md->noLP) {
300       /* remove unwanted pairs */
301       for (k = 1; k < n - turn - 1; k++)
302         for (l = 1; l <= 2; l++) {
303           int type, ntype = 0, otype = 0;
304           i     = k;
305           j     = i + turn + l;
306           type  = pscore[indx[j] + i];
307           while ((i >= 1) && (j <= n)) {
308             if ((i > 1) && (j < n))
309               ntype = pscore[indx[j + 1] + i - 1];
310 
311             if ((otype < md->cv_fact * MINPSCORE) && (ntype < md->cv_fact * MINPSCORE)) /* too many counterexamples */
312               pscore[indx[j] + i] = NONE;                                               /* i.j can only form isolated pairs */
313 
314             otype = type;
315             type  = ntype;
316             i--;
317             j++;
318           }
319         }
320     }
321 
322     /*free dm */
323     for (i = 0; i < 7; i++)
324       free(dm[i]);
325     free(dm);
326 
327     for (s = 0; s < n_seq; s++)
328       free(S[s]);
329     free(S);
330 
331     free(indx);
332   }
333 
334   return pscore;
335 }
336 
337 
338 PUBLIC char **
vrna_aln_slice(const char ** alignment,unsigned int i,unsigned int j)339 vrna_aln_slice(const char   **alignment,
340                unsigned int i,
341                unsigned int j)
342 {
343   char          **sub;
344   unsigned int  n;
345   int           n_seq, s;
346 
347   sub = NULL;
348 
349   if (alignment) {
350     /* check if [i,j] is within range */
351     n = strlen(alignment[0]);
352 
353     if ((i < j) && (j <= n)) {
354       /* get number of sequences in alignment */
355       for (n_seq = 0; alignment[n_seq] != NULL; n_seq++);
356 
357       sub = (char **)vrna_alloc(sizeof(char *) * (n_seq + 1));
358       for (s = 0; s < n_seq; s++)
359         sub[s] = vrna_alloc(sizeof(char) * (j - i + 2));
360       sub[s] = NULL;
361 
362       /* copy subalignment */
363       for (s = 0; s < n_seq; s++) {
364         sub[s]              = memcpy(sub[s], alignment[s] + i - 1, sizeof(char) * (j - i + 1));
365         sub[s][(j - i + 1)] = '\0';
366       }
367     }
368   }
369 
370   return sub;
371 }
372 
373 
374 PUBLIC void
vrna_aln_free(char ** alignment)375 vrna_aln_free(char **alignment)
376 {
377   int n_seq;
378 
379   if (alignment) {
380     for (n_seq = 0; alignment[n_seq] != NULL; n_seq++)
381       free(alignment[n_seq]);
382     free(alignment);
383   }
384 }
385 
386 
387 PUBLIC char **
vrna_aln_uppercase(const char ** alignment)388 vrna_aln_uppercase(const char **alignment)
389 {
390   if (alignment)
391     return copy_alignment(alignment, VRNA_ALN_UPPERCASE);
392 
393   return NULL;
394 }
395 
396 
397 PUBLIC char **
vrna_aln_toRNA(const char ** alignment)398 vrna_aln_toRNA(const char **alignment)
399 {
400   if (alignment)
401     return copy_alignment(alignment, VRNA_ALN_RNA);
402 
403   return NULL;
404 }
405 
406 
407 PUBLIC char **
vrna_aln_copy(const char ** alignment,unsigned int options)408 vrna_aln_copy(const char    **alignment,
409               unsigned int  options)
410 {
411   if (alignment)
412     return copy_alignment(alignment, options);
413 
414   return NULL;
415 }
416 
417 
418 PUBLIC float *
vrna_aln_conservation_struct(const char ** alignment,const char * structure,const vrna_md_t * md_p)419 vrna_aln_conservation_struct(const char       **alignment,
420                              const char       *structure,
421                              const vrna_md_t  *md_p)
422 {
423   unsigned int  i, j, n, s, n_seq;
424   int           a, b;
425   float         *conservation = NULL;
426   short         *pt;
427   vrna_md_t     md;
428 
429   if ((alignment) && (structure)) {
430     n = strlen(structure);
431     if (n > 0) {
432       /* check alignment for consistency */
433       for (s = 0; alignment[s]; s++) {
434         if (strlen(alignment[s]) != n) {
435           vrna_message_warning("vrna_aln_bpcons: Length of aligned sequence #%d does not match consensus structure length\n"
436                                "%s\n\%s\n",
437                                s + 1,
438                                alignment[s],
439                                structure);
440           return NULL;
441         }
442       }
443 
444       n_seq = s;
445 
446       if (md_p)
447         vrna_md_copy(&md, md_p);
448       else
449         vrna_md_set_default(&md);
450 
451       pt            = vrna_ptable(structure);
452       conservation  = (float *)vrna_alloc(sizeof(float) * (n + 1));
453 
454       for (i = 1; i < n; i++) {
455         if (i < pt[i]) {
456           j = pt[i];
457           for (s = 0; s < n_seq; s++) {
458             a = vrna_nucleotide_encode(alignment[s][i - 1], &md);
459             b = vrna_nucleotide_encode(alignment[s][j - 1], &md);
460             if (md.pair[a][b]) {
461               conservation[i] += 1.;
462               conservation[j] += 1.;
463             }
464           }
465           conservation[i] /= (float)n_seq;
466           conservation[j] /= (float)n_seq;
467         }
468       }
469 
470       free(pt);
471     } else {
472       vrna_message_warning("vrna_aln_bpcons: Structure length is 0!");
473     }
474   }
475 
476   return conservation;
477 }
478 
479 
480 PUBLIC float *
vrna_aln_conservation_col(const char ** alignment,const vrna_md_t * md_p,unsigned int options)481 vrna_aln_conservation_col(const char      **alignment,
482                           const vrna_md_t *md_p,
483                           unsigned int    options)
484 {
485   unsigned int  i, n, s, n_seq;
486   float         *conservation = NULL;
487   vrna_md_t     md;
488 
489   if (alignment) {
490     n = strlen(alignment[0]);
491     if (n > 0) {
492       /* check alignment for consistency */
493       for (s = 1; alignment[s]; s++) {
494         if (strlen(alignment[s]) != n) {
495           vrna_message_warning("vrna_aln_conservation: Length of aligned sequence #%d does not match length of first sequence\n"
496                                "%s\n\n",
497                                s + 1,
498                                alignment[s]);
499           return NULL;
500         }
501       }
502 
503       n_seq = s;
504 
505       if (md_p)
506         vrna_md_copy(&md, md_p);
507       else
508         vrna_md_set_default(&md);
509 
510       conservation = (float *)vrna_alloc(sizeof(float) * (n + 1));
511       for (i = 1; i <= n; i++) {
512         /*
513          *  Here, we differentiate between 32 different nucleotide types.
514          *  Change this if we ever cope with larger alphabets!
515          */
516         unsigned int nfreq[32] = {
517           0, 0, 0, 0, 0, 0, 0, 0,
518           0, 0, 0, 0, 0, 0, 0, 0,
519           0, 0, 0, 0, 0, 0, 0, 0,
520           0, 0, 0, 0, 0, 0, 0, 0
521         };
522 
523         /* count frequencies of individual nucleotides */
524         for (s = 0; s < n_seq; s++)
525           nfreq[vrna_nucleotide_encode(alignment[s][i - 1], &md)]++;
526 
527         if (options & VRNA_MEASURE_SHANNON_ENTROPY) {
528           double sum = 0;
529           for (s = 0; s < 32; s++) {
530             if (nfreq[s] > 0) {
531               double p = (double)nfreq[s] / (double)n_seq;
532               sum += p * log(p) / log(2.0);
533             }
534           }
535           conservation[i] = (float)(-sum);
536         }
537       }
538     } else {
539       vrna_message_warning("vrna_aln_conservation: Length of first sequence in alignment is 0!");
540     }
541   }
542 
543   return conservation;
544 }
545 
546 
547 PUBLIC char *
vrna_aln_consensus_sequence(const char ** alignment,const vrna_md_t * md_p)548 vrna_aln_consensus_sequence(const char      **alignment,
549                             const vrna_md_t *md_p)
550 {
551   /* simple consensus sequence (most frequent character) */
552   char          *consensus;
553   unsigned int  i, n, s, n_seq;
554   vrna_md_t     md;
555 
556   consensus = NULL;
557 
558   if (alignment) {
559     n = strlen(alignment[0]);
560     if (n > 0) {
561       /* check alignment for consistency */
562       for (s = 1; alignment[s]; s++) {
563         if (strlen(alignment[s]) != n) {
564           vrna_message_warning("vrna_aln_consensus_sequence: "
565                                "Length of aligned sequence #%d does not match length of first sequence\n"
566                                "%s\n\n",
567                                s + 1,
568                                alignment[s]);
569           return NULL;
570         }
571       }
572 
573       n_seq = s;
574 
575       if (md_p)
576         vrna_md_copy(&md, md_p);
577       else
578         vrna_md_set_default(&md);
579 
580       consensus = (char *)vrna_alloc((n + 1) * sizeof(char));
581 
582       for (i = 0; i < n; i++) {
583         int c, fm;
584         int freq[8] = {
585           0, 0, 0, 0, 0, 0, 0, 0
586         };
587 
588         for (s = 0; s < n_seq; s++)
589           freq[vrna_nucleotide_encode(alignment[s][i], &md)]++;
590 
591         for (s = c = fm = 0; s < 8; s++) /* find the most frequent char */
592           if (freq[s] > fm) {
593             c   = s;
594             fm  = freq[c];
595           }
596 
597         if (s > 4)
598           s++;        /* skip T */
599 
600         consensus[i] = vrna_nucleotide_decode(c, &md);
601       }
602     }
603   }
604 
605   return consensus;
606 }
607 
608 
609 PUBLIC char *
vrna_aln_consensus_mis(const char ** alignment,const vrna_md_t * md_p)610 vrna_aln_consensus_mis(const char       **alignment,
611                        const vrna_md_t  *md_p)
612 {
613   /*
614    *  MIS displays the 'most informative sequence' (Freyhult et al 2004),
615    *  elements in columns with frequency greater than the background
616    *  frequency are projected into iupac notation. Columns where gaps are
617    *  over-represented are in lower case.
618    */
619   char          *mis;
620   unsigned char c;
621   unsigned int  i, n, s, n_seq;
622   unsigned int  bgfreq[8] = {
623     0, 0, 0, 0, 0, 0, 0, 0
624   };
625   vrna_md_t     md;
626 
627   mis = NULL;
628 
629   if (alignment) {
630     n = strlen(alignment[0]);
631     if (n > 0) {
632       /* check alignment for consistency */
633       for (s = 1; alignment[s]; s++) {
634         if (strlen(alignment[s]) != n) {
635           vrna_message_warning("vrna_aln_consensus_mis: "
636                                "Length of aligned sequence #%d does not match length of first sequence\n"
637                                "%s\n\n",
638                                s + 1,
639                                alignment[s]);
640           return NULL;
641         }
642       }
643 
644       n_seq = s;
645 
646       if (md_p)
647         vrna_md_copy(&md, md_p);
648       else
649         vrna_md_set_default(&md);
650 
651       mis = (char *)vrna_alloc((n + 1) * sizeof(char));
652 
653       /* determien backgroud frequencies */
654       for (i = 0; i < n; i++)
655         for (s = 0; s < n_seq; s++) {
656           c = (unsigned char)vrna_nucleotide_encode(alignment[s][i], &md);
657           if (c > 4)
658             c = 5;
659 
660           bgfreq[c]++;
661         }
662 
663       /* generate MIS */
664       for (i = 0; i < n; i++) {
665         int           code    = 0;
666         unsigned int  freq[8] = {
667           0, 0, 0, 0, 0, 0, 0, 0
668         };
669 
670         for (s = 0; s < n_seq; s++) {
671           c = (unsigned char)vrna_nucleotide_encode(alignment[s][i], &md);
672           if (c > 4)
673             c = 5;
674 
675           freq[c]++;
676         }
677         for (c = 4; c > 0; c--) {
678           code <<= 1;
679           if (freq[c] * n >= bgfreq[c])
680             code++;
681         }
682         mis[i] = IUP[code];
683         if (freq[0] * n > bgfreq[0])
684           mis[i] = tolower(IUP[code]);
685       }
686     }
687   }
688 
689   return mis;
690 }
691 
692 
693 /*
694  #####################################
695  # BEGIN OF STATIC HELPER FUNCTIONS  #
696  #####################################
697  */
698 PRIVATE char **
copy_alignment(const char ** alignment,unsigned int options)699 copy_alignment(const char   **alignment,
700                unsigned int options)
701 {
702   char          **output = NULL;
703   unsigned int  s;
704 
705   /* determine number of sequence in the alignment */
706   for (s = 0; alignment[s]; s++);
707 
708   /* allocate memory for output alignment */
709   output = (char **)vrna_alloc(sizeof(char *) * (s + 1));
710 
711   /* copy alignment and convert to uppercase */
712   for (s = 0; alignment[s]; s++) {
713     output[s] = strdup(alignment[s]);
714 
715     if (options & VRNA_ALN_UPPERCASE)
716       /* convert to uppercase letters */
717       vrna_seq_toupper(output[s]);
718 
719     if (options & VRNA_ALN_RNA)
720       /* convert DNA to RNA alphabet */
721       vrna_seq_toRNA(output[s]);
722   }
723 
724   /* mark end of alignment */
725   output[s] = NULL;
726 
727   return output;
728 }
729 
730 
731 /*
732  *###########################################
733  *# deprecated functions below              #
734  *###########################################
735  */
736 
737 #ifndef VRNA_DISABLE_BACKWARD_COMPATIBILITY
738 
739 #define MAX_NUM_NAMES    500
740 
741 PRIVATE void
742 encode_ali_sequence_old(const char      *sequence,
743                         short           *S,
744                         short           *s5,
745                         short           *s3,
746                         char            *ss,
747                         unsigned short  *as,
748                         int             circular);
749 
750 
751 int
read_clustal(FILE * clust,char * AlignedSeqs[],char * names[])752 read_clustal(FILE *clust,
753              char *AlignedSeqs[],
754              char *names[])
755 {
756   char  *line, name[100] = "", *seq;
757   int   n, nn = 0, num_seq = 0, i;
758 
759   if ((line = vrna_read_line(clust)) == NULL) {
760     vrna_message_warning("Empty CLUSTAL file");
761     return 0;
762   }
763 
764   if ((strncmp(line, "CLUSTAL", 7) != 0) && (!strstr(line, "STOCKHOLM"))) {
765     vrna_message_warning("This doesn't look like a CLUSTAL/STOCKHOLM file, sorry");
766     free(line);
767     return 0;
768   }
769 
770   free(line);
771   line = vrna_read_line(clust);
772 
773   while (line != NULL) {
774     if (strncmp(line, "//", 2) == 0) {
775       free(line);
776       break;
777     }
778 
779     if (((n = strlen(line)) < 4) || isspace((int)line[0])) {
780       /* skip non-sequence line */
781       free(line);
782       line  = vrna_read_line(clust);
783       nn    = 0; /* reset seqence number */
784       continue;
785     }
786 
787     /* skip comments */
788     if (line[0] == '#') {
789       free(line);
790       line = vrna_read_line(clust);
791       continue;
792     }
793 
794     seq = (char *)vrna_alloc((n + 1) * sizeof(char));
795     sscanf(line, "%99s %s", name, seq);
796 
797     for (i = 0; i < strlen(seq); i++) {
798       if (seq[i] == '.')
799         seq[i] = '-';                 /* replace '.' gaps by '-' */
800 
801       /* comment the next line and think about something more difficult to deal with
802        * lowercase sequence letters if you really want to */
803       seq[i] = toupper(seq[i]);
804     }
805 
806     if (nn == num_seq) {
807       /* first time */
808       names[nn]       = strdup(name);
809       AlignedSeqs[nn] = strdup(seq);
810     } else {
811       if (strcmp(name, names[nn]) != 0) {
812         /* name doesn't match */
813         vrna_message_warning("Sorry, your file is messed up (inconsitent seq-names)");
814         free(line);
815         free(seq);
816         return 0;
817       }
818 
819       AlignedSeqs[nn] = (char *)
820                         vrna_realloc(AlignedSeqs[nn], strlen(seq) + strlen(AlignedSeqs[nn]) + 1);
821       strcat(AlignedSeqs[nn], seq);
822     }
823 
824     nn++;
825     if (nn > num_seq)
826       num_seq = nn;
827 
828     free(seq);
829     free(line);
830     if (num_seq >= MAX_NUM_NAMES) {
831       vrna_message_warning("Too many sequences in CLUSTAL/STOCKHOLM file");
832       return 0;
833     }
834 
835     line = vrna_read_line(clust);
836   }
837 
838   AlignedSeqs[num_seq]  = NULL;
839   names[num_seq]        = NULL;
840   if (num_seq == 0) {
841     vrna_message_warning("No sequences found in CLUSTAL/STOCKHOLM file");
842     return 0;
843   }
844 
845   n = strlen(AlignedSeqs[0]);
846   for (nn = 1; nn < num_seq; nn++) {
847     if (strlen(AlignedSeqs[nn]) != n) {
848       vrna_message_warning("Sorry, your file is messed up.\n"
849                            "Unequal lengths!");
850       return 0;
851     }
852   }
853 
854   vrna_message_info(stderr, "%d sequences; length of alignment %d.", nn, n);
855   return num_seq;
856 }
857 
858 
859 char *
consensus(const char * AS[])860 consensus(const char *AS[])
861 {
862   /* simple consensus sequence (most frequent character) */
863   char  *string;
864   int   i, n;
865 
866   string = NULL;
867 
868   if (AS) {
869     n       = strlen(AS[0]);
870     string  = (char *)vrna_alloc((n + 1) * sizeof(char));
871     for (i = 0; i < n; i++) {
872       int s, c, fm, freq[8] = {
873         0, 0, 0, 0, 0, 0, 0, 0
874       };
875       for (s = 0; AS[s] != NULL; s++)
876         freq[encode_char(AS[s][i])]++;
877       for (s = c = fm = 0; s < 8; s++) /* find the most frequent char */
878         if (freq[s] > fm)
879           c = s, fm = freq[c];
880 
881       if (s > 4)
882         s++;        /* skip T */
883 
884       string[i] = Law_and_Order[c];
885     }
886   }
887 
888   return string;
889 }
890 
891 
892 char *
consens_mis(const char * AS[])893 consens_mis(const char *AS[])
894 {
895   /* MIS displays the 'most informative sequence' (Freyhult et al 2004),
896    * elements in columns with frequency greater than the background
897    * frequency are projected into iupac notation. Columns where gaps are
898    * over-represented are in lower case. */
899 
900   char  *cons;
901   int   i, s, n, N, c;
902   int   bgfreq[8] = {
903     0, 0, 0, 0, 0, 0, 0, 0
904   };
905 
906   cons = NULL;
907 
908   if (AS) {
909     n = strlen(AS[0]);
910     for (N = 0; AS[N] != NULL; N++);
911     cons = (char *)vrna_alloc((n + 1) * sizeof(char));
912 
913     for (i = 0; i < n; i++)
914       for (s = 0; s < N; s++) {
915         c = encode_char(AS[s][i]);
916         if (c > 4)
917           c = 5;
918 
919         bgfreq[c]++;
920       }
921 
922     for (i = 0; i < n; i++) {
923       int freq[8] = {
924         0, 0, 0, 0, 0, 0, 0, 0
925       };
926       int code = 0;
927       for (s = 0; s < N; s++) {
928         c = encode_char(AS[s][i]);
929         if (c > 4)
930           c = 5;
931 
932         freq[c]++;
933       }
934       for (c = 4; c > 0; c--) {
935         code <<= 1;
936         if (freq[c] * n >= bgfreq[c])
937           code++;
938       }
939       cons[i] = IUP[code];
940       if (freq[0] * n > bgfreq[0])
941         cons[i] = tolower(IUP[code]);
942     }
943   }
944 
945   return cons;
946 }
947 
948 
949 PUBLIC char *
get_ungapped_sequence(const char * seq)950 get_ungapped_sequence(const char *seq)
951 {
952   char  *tmp_sequence, *b;
953   int   i;
954 
955   tmp_sequence = strdup(seq);
956 
957   b = tmp_sequence;
958   i = 0;
959   do {
960     if ((*b == '-') || (*b == '_') || (*b == '~') || (*b == '.'))
961       continue;
962 
963     tmp_sequence[i] = *b;
964     i++;
965   } while (*(++b));
966 
967   tmp_sequence    = (char *)vrna_realloc(tmp_sequence, (i + 1) * sizeof(char));
968   tmp_sequence[i] = '\0';
969 
970   return tmp_sequence;
971 }
972 
973 
974 PUBLIC int
get_mpi(char * Alseq[],int n_seq,int length,int * mini)975 get_mpi(char  *Alseq[],
976         int   n_seq,
977         int   length,
978         int   *mini)
979 {
980   int   i, j, k, pairnum = 0, sumident = 0;
981   float ident = 0, minimum = 1.;
982 
983   for (j = 0; j < n_seq - 1; j++)
984     for (k = j + 1; k < n_seq; k++) {
985       ident = 0;
986       for (i = 1; i <= length; i++) {
987         if (Alseq[k][i] == Alseq[j][i])
988           ident++;
989 
990         pairnum++;
991       }
992       if ((ident / length) < minimum)
993         minimum = ident / (float)length;
994 
995       sumident += ident;
996     }
997   mini[0] = (int)(minimum * 100.);
998   if (pairnum > 0)
999     return (int)(sumident * 100 / pairnum);
1000   else
1001     return 0;
1002 }
1003 
1004 
1005 PUBLIC void
alloc_sequence_arrays(const char ** sequences,short *** S,short *** S5,short *** S3,unsigned short *** a2s,char *** Ss,int circ)1006 alloc_sequence_arrays(const char      **sequences,
1007                       short           ***S,
1008                       short           ***S5,
1009                       short           ***S3,
1010                       unsigned short  ***a2s,
1011                       char            ***Ss,
1012                       int             circ)
1013 {
1014   unsigned int s, n_seq, length;
1015 
1016   if (sequences[0] != NULL) {
1017     length = strlen(sequences[0]);
1018     for (s = 0; sequences[s] != NULL; s++);
1019     n_seq = s;
1020     *S    = (short **)vrna_alloc((n_seq + 1) * sizeof(short *));
1021     *S5   = (short **)vrna_alloc((n_seq + 1) * sizeof(short *));
1022     *S3   = (short **)vrna_alloc((n_seq + 1) * sizeof(short *));
1023     *a2s  = (unsigned short **)vrna_alloc((n_seq + 1) * sizeof(unsigned short *));
1024     *Ss   = (char **)vrna_alloc((n_seq + 1) * sizeof(char *));
1025     for (s = 0; s < n_seq; s++) {
1026       if (strlen(sequences[s]) != length)
1027         vrna_message_error("uneqal seqence lengths");
1028 
1029       (*S5)[s]  = (short *)vrna_alloc((length + 2) * sizeof(short));
1030       (*S3)[s]  = (short *)vrna_alloc((length + 2) * sizeof(short));
1031       (*a2s)[s] = (unsigned short *)vrna_alloc((length + 2) * sizeof(unsigned short));
1032       (*Ss)[s]  = (char *)vrna_alloc((length + 2) * sizeof(char));
1033       (*S)[s]   = (short *)vrna_alloc((length + 2) * sizeof(short));
1034       encode_ali_sequence_old(sequences[s], (*S)[s], (*S5)[s], (*S3)[s], (*Ss)[s], (*a2s)[s], circ);
1035     }
1036     (*S5)[n_seq]  = NULL;
1037     (*S3)[n_seq]  = NULL;
1038     (*a2s)[n_seq] = NULL;
1039     (*Ss)[n_seq]  = NULL;
1040     (*S)[n_seq]   = NULL;
1041   } else {
1042     vrna_message_error("alloc_sequence_arrays: no sequences in the alignment!");
1043   }
1044 }
1045 
1046 
1047 PUBLIC void
free_sequence_arrays(unsigned int n_seq,short *** S,short *** S5,short *** S3,unsigned short *** a2s,char *** Ss)1048 free_sequence_arrays(unsigned int   n_seq,
1049                      short          ***S,
1050                      short          ***S5,
1051                      short          ***S3,
1052                      unsigned short ***a2s,
1053                      char           ***Ss)
1054 {
1055   unsigned int s;
1056 
1057   for (s = 0; s < n_seq; s++) {
1058     free((*S)[s]);
1059     free((*S5)[s]);
1060     free((*S3)[s]);
1061     free((*a2s)[s]);
1062     free((*Ss)[s]);
1063   }
1064   free(*S);
1065   *S = NULL;
1066   free(*S5);
1067   *S5 = NULL;
1068   free(*S3);
1069   *S3 = NULL;
1070   free(*a2s);
1071   *a2s = NULL;
1072   free(*Ss);
1073   *Ss = NULL;
1074 }
1075 
1076 
1077 PUBLIC void
encode_ali_sequence(const char * sequence,short * S,short * s5,short * s3,char * ss,unsigned short * as,int circular)1078 encode_ali_sequence(const char      *sequence,
1079                     short           *S,
1080                     short           *s5,
1081                     short           *s3,
1082                     char            *ss,
1083                     unsigned short  *as,
1084                     int             circular)
1085 {
1086   if ((sequence) &&
1087       (S) &&
1088       (s5) &&
1089       (s3) &&
1090       (ss) &&
1091       (as))
1092     encode_ali_sequence_old(sequence, S, s5, s3, ss, as, circular);
1093 }
1094 
1095 
1096 PRIVATE void
encode_ali_sequence_old(const char * sequence,short * S,short * s5,short * s3,char * ss,unsigned short * as,int circular)1097 encode_ali_sequence_old(const char      *sequence,
1098                         short           *S,
1099                         short           *s5,
1100                         short           *s3,
1101                         char            *ss,
1102                         unsigned short  *as,
1103                         int             circular)
1104 {
1105   unsigned int    i, l;
1106   unsigned short  p;
1107 
1108   l     = strlen(sequence);
1109   S[0]  = (short)l;
1110   s5[0] = s5[1] = 0;
1111 
1112   /* make numerical encoding of sequence */
1113   for (i = 1; i <= l; i++) {
1114     short ctemp;
1115     ctemp = (short)encode_char(toupper(sequence[i - 1]));
1116     S[i]  = ctemp;
1117   }
1118 
1119   if (oldAliEn) {
1120     /* use alignment sequences in all energy evaluations */
1121     ss[0] = sequence[0];
1122     for (i = 1; i < l; i++) {
1123       s5[i] = S[i - 1];
1124       s3[i] = S[i + 1];
1125       ss[i] = sequence[i];
1126       as[i] = i;
1127     }
1128     ss[l]     = sequence[l];
1129     as[l]     = l;
1130     s5[l]     = S[l - 1];
1131     s3[l]     = 0;
1132     S[l + 1]  = S[1];
1133     s5[1]     = 0;
1134     if (circular) {
1135       s5[1]     = S[l];
1136       s3[l]     = S[1];
1137       ss[l + 1] = S[1];
1138     }
1139   } else {
1140     if (circular) {
1141       for (i = l; i > 0; i--) {
1142         char c5;
1143         c5 = sequence[i - 1];
1144         if ((c5 == '-') || (c5 == '_') || (c5 == '~') || (c5 == '.'))
1145           continue;
1146 
1147         s5[1] = S[i];
1148         break;
1149       }
1150       for (i = 1; i <= l; i++) {
1151         char c3;
1152         c3 = sequence[i - 1];
1153         if ((c3 == '-') || (c3 == '_') || (c3 == '~') || (c3 == '.'))
1154           continue;
1155 
1156         s3[l] = S[i];
1157         break;
1158       }
1159     } else {
1160       s5[1] = s3[l] = 0;
1161     }
1162 
1163     for (i = 1, p = 0; i <= l; i++) {
1164       char c5;
1165       c5 = sequence[i - 1];
1166       if ((c5 == '-') || (c5 == '_') || (c5 == '~') || (c5 == '.')) {
1167         s5[i + 1] = s5[i];
1168       } else {
1169         /* no gap */
1170         ss[p++]   = sequence[i - 1]; /*start at 0!!*/
1171         s5[i + 1] = S[i];
1172       }
1173 
1174       as[i] = p;
1175     }
1176     for (i = l; i >= 1; i--) {
1177       char c3;
1178       c3 = sequence[i - 1];
1179       if ((c3 == '-') || (c3 == '_') || (c3 == '~') || (c3 == '.'))
1180         s3[i - 1] = s3[i];
1181       else
1182         s3[i - 1] = S[i];
1183     }
1184   }
1185 }
1186 
1187 
1188 #endif
1189