1 /*
2  * Fast, but crude, pairwise structural Alignments of RNA sequences
3  *
4  * Possible structures of each RNA are encoded in a linear
5  * "probability profile", by computing for each base the probability
6  * of being unpaired, or paired upstream or downstream. These profiles
7  * can be aligned using standard string alignment.
8  *
9  * The is an extension of the old method in ProfileDist.c with the
10  * following changes:
11  * - use sequence as well as structure profile for scoring
12  * - use similarity alignment instead of distance (maybe add local alinment)
13  * - use affine gap costs
14  *
15  *  C Ivo L Hofacker, Vienna RNA Package
16  */
17 
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21 
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <ctype.h>
26 #include <math.h>
27 #include <float.h>
28 #include "ViennaRNA/dist_vars.h"
29 #include "ViennaRNA/fold_vars.h"
30 #include "ViennaRNA/part_func.h"
31 #include "ViennaRNA/utils/basic.h"
32 #include "ViennaRNA/profiledist.h"
33 #include "ViennaRNA/ProfileAln.h"
34 
35 
36 #define EQUAL(x, y)     (fabs((x) - (y)) <= fabs(x) * 2 * FLT_EPSILON)
37 
38 PRIVATE int *alignment[2];
39 
40 PRIVATE void
41 sprint_aligned_bppm(const float *T1,
42                     const char  *seq1,
43                     const float *T2,
44                     const char  *seq2);
45 
46 
47 PRIVATE double
48 PrfEditScore(const float  *p1,
49              const float  *p2,
50              char         c1,
51              char         c2);
52 
53 
54 PRIVATE double
55 average(double  x,
56         double  y);
57 
58 
59 PRIVATE double  open = -1.5, ext = -0.666;  /* defaults from clustalw */
60 PRIVATE double  seqw      = 0.5;
61 PRIVATE int     free_ends = 1;              /* whether to use free end gaps */
62 
63 /*---------------------------------------------------------------------------*/
64 
65 PRIVATE float **
newmat(int l1,int l2)66 newmat(int  l1,
67        int  l2)
68 {
69   float **a;
70   int   i;
71 
72   a = (float **)vrna_alloc((l1 + 1) * sizeof(float *));
73   for (i = 0; i <= l1; i++)
74     a[i] = (float *)vrna_alloc((l2 + 1) * sizeof(float));
75   return a;
76 }
77 
78 
79 PUBLIC float
profile_aln(const float * T1,const char * seq1,const float * T2,const char * seq2)80 profile_aln(const float *T1,
81             const char  *seq1,
82             const float *T2,
83             const char  *seq2)
84 {
85   /* align the 2 probability profiles T1, T2 */
86   /* This is like a Needleman-Wunsch alignment, with affine gap-costs
87    * ala Gotoh. The score looks at both seq and pair profile */
88 
89   float **S, **E, **F, tot_score;
90   int   i, j, length1, length2;
91 
92   length1 = strlen(seq1);
93   length2 = strlen(seq2);
94   S       = newmat(length1, length2);
95   E       = newmat(length1, length2);
96   F       = newmat(length1, length2);
97 
98   E[0][0]   = F[0][0] = open - ext;
99   S[0][0]   = 0;
100   tot_score = -9999.;
101 
102   for (i = 1; i <= length1; i++)
103     F[i][0] = -9999;                          /* impossible */
104   for (j = 1; j <= length2; j++)
105     E[0][j] = -9999;                          /* impossible */
106   if (!free_ends) {
107     for (i = 1; i <= length1; i++)
108       S[i][0] = E[i][0] = E[i - 1][0] + ext;
109     for (j = 1; j <= length2; j++)
110       S[0][j] = F[0][j] = F[0][j - 1] + ext;
111   }
112 
113   for (i = 1; i <= length1; i++) {
114     for (j = 1; j <= length2; j++) {
115       float M;
116       E[i][j] = MAX2(E[i - 1][j] + ext, S[i - 1][j] + open);
117       F[i][j] = MAX2(F[i][j - 1] + ext, S[i][j - 1] + open);
118       M       = S[i - 1][j - 1] + PrfEditScore(T1 + 3 * i, T2 + 3 * j, seq1[i - 1], seq2[j - 1]);
119       S[i][j] = MAX3(M, E[i][j], F[i][j]);
120     }
121   }
122 
123   if (edit_backtrack) {
124     double  score = 0;
125     char    state = 'S';
126     int     pos, i, j;
127     alignment[0]  = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
128     alignment[1]  = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
129 
130     pos = length1 + length2;
131     i   = length1;
132     j   = length2;
133 
134     tot_score = S[length1][length2];
135 
136     if (free_ends) {
137       /* find starting point for backtracking,
138        * search for highest entry in last row or column */
139       int imax = 0;
140       for (i = 1; i <= length1; i++) {
141         if (S[i][length2] > score) {
142           score = S[i][length2];
143           imax  = i;
144         }
145       }
146       for (j = 1; j <= length2; j++) {
147         if (S[length1][j] > score) {
148           score = S[length1][j];
149           imax  = -j;
150         }
151       }
152       if (imax < 0) {
153         for (j = length2; j > -imax; j--) {
154           alignment[0][pos]   = 0;
155           alignment[1][pos--] = j;
156         }
157         i = length1;
158       } else {
159         for (i = length1; i > imax; i--) {
160           alignment[0][pos]   = i;
161           alignment[1][pos--] = 0;
162         }
163         j = length2;
164       }
165 
166       tot_score = score;
167     }
168 
169     while (i > 0 && j > 0) {
170       switch (state) {
171         case 'E':
172           score               = E[i][j];
173           alignment[0][pos]   = i;
174           alignment[1][pos--] = 0;
175           if (EQUAL(score, S[i - 1][j] + open))
176             state = 'S';
177 
178           i--;
179           break;
180         case 'F':
181           score               = F[i][j];
182           alignment[0][pos]   = 0;
183           alignment[1][pos--] = j;
184           if (EQUAL(score, S[i][j - 1] + open))
185             state = 'S';
186 
187           j--;
188           break;
189         case 'S':
190           score = S[i][j];
191           if (EQUAL(score, E[i][j])) {
192             state = 'E';
193           } else if (EQUAL(score, F[i][j])) {
194             state = 'F';
195           } else if (EQUAL(score, S[i - 1][j - 1] +
196                            PrfEditScore(T1 + 3 * i, T2 + 3 * j, seq1[i - 1], seq2[j - 1]))) {
197             alignment[0][pos]   = i;
198             alignment[1][pos--] = j;
199             i--;
200             j--;
201           } else {
202             vrna_message_error("backtrack of alignment failed");
203           }
204 
205           break;
206       }
207     }
208 
209     for (; j > 0; j--) {
210       alignment[0][pos]   = 0;
211       alignment[1][pos--] = j;
212     }
213     for (; i > 0; i--) {
214       alignment[0][pos]   = i;
215       alignment[1][pos--] = 0;
216     }
217 
218     for (i = pos + 1; i <= length1 + length2; i++) {
219       alignment[0][i - pos] = alignment[0][i];
220       alignment[1][i - pos] = alignment[1][i];
221     }
222     alignment[0][0] = length1 + length2 - pos;   /* length of alignment */
223 
224     sprint_aligned_bppm(T1, seq1, T2, seq2);
225     free(alignment[0]);
226     free(alignment[1]);
227   }
228 
229   for (i = 0; i <= length1; i++) {
230     free(S[i]);
231     free(E[i]);
232     free(F[i]);
233   }
234   free(S);
235   free(E);
236   free(F);
237 
238   return tot_score;
239 }
240 
241 
242 /*---------------------------------------------------------------------------*/
243 PRIVATE inline double
average(double x,double y)244 average(double  x,
245         double  y)
246 {
247   /*
248    * As in Bonhoeffer et al (1993) 'RNA Multi Structure Landscapes',
249    * Eur. Biophys. J. 22: 13-24 we have chosen  the geometric mean.
250    */
251   return (float)sqrt(x * y);
252 }
253 
254 
255 PRIVATE double
PrfEditScore(const float * p1,const float * p2,char c1,char c2)256 PrfEditScore(const float  *p1,
257              const float  *p2,
258              char         c1,
259              char         c2)
260 {
261   double  score;
262   int     k;
263 
264   for (score = 0., k = 0; k < 3; k++)
265     score += average(p1[k], p2[k]);
266 
267   score *= (1 - seqw);
268   if (c1 == c2)
269     score += seqw;
270   else if (((c1 == 'A') && (c2 == 'G')) ||
271            ((c1 == 'G') && (c2 == 'A')) ||
272            ((c1 == 'C') && (c2 == 'U')) ||
273            ((c1 == 'U') && (c2 == 'C')))
274     score += 0.5 * seqw;
275   else
276     score -= 0.9 * seqw;
277 
278   return score;
279 }
280 
281 
282 /*---------------------------------------------------------------------------*/
283 
284 PRIVATE void
sprint_aligned_bppm(const float * T1,const char * seq1,const float * T2,const char * seq2)285 sprint_aligned_bppm(const float *T1,
286                     const char  *seq1,
287                     const float *T2,
288                     const char  *seq2)
289 {
290   int i, length;
291 
292   length = alignment[0][0];
293   for (i = 0; i < 4; i++) {
294     if (aligned_line[i] != NULL)
295       free(aligned_line[i]);
296 
297     aligned_line[i] = (char *)vrna_alloc((length + 1) * sizeof(char));
298   }
299   for (i = 1; i <= length; i++) {
300     if (alignment[0][i] == 0) {
301       aligned_line[0][i - 1] = aligned_line[2][i - 1] = '_';
302     } else {
303       aligned_line[0][i - 1]  = vrna_bpp_symbol(T1 + alignment[0][i] * 3);
304       aligned_line[2][i - 1]  = seq1[alignment[0][i] - 1];
305     }
306 
307     if (alignment[1][i] == 0) {
308       aligned_line[1][i - 1] = aligned_line[3][i - 1] = '_';
309     } else {
310       aligned_line[1][i - 1]  = vrna_bpp_symbol(T2 + alignment[1][i] * 3);
311       aligned_line[3][i - 1]  = seq2[alignment[1][i] - 1];
312     }
313   }
314 }
315 
316 
317 PUBLIC int
set_paln_params(double gap_open,double gap_ext,double seq_weight,int freeends)318 set_paln_params(double  gap_open,
319                 double  gap_ext,
320                 double  seq_weight,
321                 int     freeends)
322 {
323   open  = (gap_open > 0) ? -gap_open : gap_open;
324   ext   = (gap_ext > 0) ? -gap_ext : gap_ext;
325   if (open > ext)
326     vrna_message_warning("Gap extension penalty is smaller than "
327                          "gap open. Do you realy want this?");
328 
329   seqw = seq_weight;
330   if (seqw < 0) {
331     seqw = 0;
332     vrna_message_warning("Sequence weight set to 0 (must be in [0..1])");
333   } else
334   if (seqw > 1) {
335     seqw = 1;
336     vrna_message_warning("Sequence weight set to 1 (must be in [0..1])");
337   }
338 
339   free_ends = (freeends) ? 1 : 0;
340   return 0;
341 }
342 
343 
344 /*---------------------------------------------------------------------------*/
345