1 /*
2 * Fast, but crude, pairwise structural Alignments of RNA sequences
3 *
4 * Possible structures of each RNA are encoded in a linear
5 * "probability profile", by computing for each base the probability
6 * of being unpaired, or paired upstream or downstream. These profiles
7 * can be aligned using standard string alignment.
8 *
9 * The is an extension of the old method in ProfileDist.c with the
10 * following changes:
11 * - use sequence as well as structure profile for scoring
12 * - use similarity alignment instead of distance (maybe add local alinment)
13 * - use affine gap costs
14 *
15 * C Ivo L Hofacker, Vienna RNA Package
16 */
17
18 #ifdef HAVE_CONFIG_H
19 #include "config.h"
20 #endif
21
22 #include <stdio.h>
23 #include <stdlib.h>
24 #include <string.h>
25 #include <ctype.h>
26 #include <math.h>
27 #include <float.h>
28 #include "ViennaRNA/dist_vars.h"
29 #include "ViennaRNA/fold_vars.h"
30 #include "ViennaRNA/part_func.h"
31 #include "ViennaRNA/utils/basic.h"
32 #include "ViennaRNA/profiledist.h"
33 #include "ViennaRNA/ProfileAln.h"
34
35
36 #define EQUAL(x, y) (fabs((x) - (y)) <= fabs(x) * 2 * FLT_EPSILON)
37
38 PRIVATE int *alignment[2];
39
40 PRIVATE void
41 sprint_aligned_bppm(const float *T1,
42 const char *seq1,
43 const float *T2,
44 const char *seq2);
45
46
47 PRIVATE double
48 PrfEditScore(const float *p1,
49 const float *p2,
50 char c1,
51 char c2);
52
53
54 PRIVATE double
55 average(double x,
56 double y);
57
58
59 PRIVATE double open = -1.5, ext = -0.666; /* defaults from clustalw */
60 PRIVATE double seqw = 0.5;
61 PRIVATE int free_ends = 1; /* whether to use free end gaps */
62
63 /*---------------------------------------------------------------------------*/
64
65 PRIVATE float **
newmat(int l1,int l2)66 newmat(int l1,
67 int l2)
68 {
69 float **a;
70 int i;
71
72 a = (float **)vrna_alloc((l1 + 1) * sizeof(float *));
73 for (i = 0; i <= l1; i++)
74 a[i] = (float *)vrna_alloc((l2 + 1) * sizeof(float));
75 return a;
76 }
77
78
79 PUBLIC float
profile_aln(const float * T1,const char * seq1,const float * T2,const char * seq2)80 profile_aln(const float *T1,
81 const char *seq1,
82 const float *T2,
83 const char *seq2)
84 {
85 /* align the 2 probability profiles T1, T2 */
86 /* This is like a Needleman-Wunsch alignment, with affine gap-costs
87 * ala Gotoh. The score looks at both seq and pair profile */
88
89 float **S, **E, **F, tot_score;
90 int i, j, length1, length2;
91
92 length1 = strlen(seq1);
93 length2 = strlen(seq2);
94 S = newmat(length1, length2);
95 E = newmat(length1, length2);
96 F = newmat(length1, length2);
97
98 E[0][0] = F[0][0] = open - ext;
99 S[0][0] = 0;
100 tot_score = -9999.;
101
102 for (i = 1; i <= length1; i++)
103 F[i][0] = -9999; /* impossible */
104 for (j = 1; j <= length2; j++)
105 E[0][j] = -9999; /* impossible */
106 if (!free_ends) {
107 for (i = 1; i <= length1; i++)
108 S[i][0] = E[i][0] = E[i - 1][0] + ext;
109 for (j = 1; j <= length2; j++)
110 S[0][j] = F[0][j] = F[0][j - 1] + ext;
111 }
112
113 for (i = 1; i <= length1; i++) {
114 for (j = 1; j <= length2; j++) {
115 float M;
116 E[i][j] = MAX2(E[i - 1][j] + ext, S[i - 1][j] + open);
117 F[i][j] = MAX2(F[i][j - 1] + ext, S[i][j - 1] + open);
118 M = S[i - 1][j - 1] + PrfEditScore(T1 + 3 * i, T2 + 3 * j, seq1[i - 1], seq2[j - 1]);
119 S[i][j] = MAX3(M, E[i][j], F[i][j]);
120 }
121 }
122
123 if (edit_backtrack) {
124 double score = 0;
125 char state = 'S';
126 int pos, i, j;
127 alignment[0] = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
128 alignment[1] = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
129
130 pos = length1 + length2;
131 i = length1;
132 j = length2;
133
134 tot_score = S[length1][length2];
135
136 if (free_ends) {
137 /* find starting point for backtracking,
138 * search for highest entry in last row or column */
139 int imax = 0;
140 for (i = 1; i <= length1; i++) {
141 if (S[i][length2] > score) {
142 score = S[i][length2];
143 imax = i;
144 }
145 }
146 for (j = 1; j <= length2; j++) {
147 if (S[length1][j] > score) {
148 score = S[length1][j];
149 imax = -j;
150 }
151 }
152 if (imax < 0) {
153 for (j = length2; j > -imax; j--) {
154 alignment[0][pos] = 0;
155 alignment[1][pos--] = j;
156 }
157 i = length1;
158 } else {
159 for (i = length1; i > imax; i--) {
160 alignment[0][pos] = i;
161 alignment[1][pos--] = 0;
162 }
163 j = length2;
164 }
165
166 tot_score = score;
167 }
168
169 while (i > 0 && j > 0) {
170 switch (state) {
171 case 'E':
172 score = E[i][j];
173 alignment[0][pos] = i;
174 alignment[1][pos--] = 0;
175 if (EQUAL(score, S[i - 1][j] + open))
176 state = 'S';
177
178 i--;
179 break;
180 case 'F':
181 score = F[i][j];
182 alignment[0][pos] = 0;
183 alignment[1][pos--] = j;
184 if (EQUAL(score, S[i][j - 1] + open))
185 state = 'S';
186
187 j--;
188 break;
189 case 'S':
190 score = S[i][j];
191 if (EQUAL(score, E[i][j])) {
192 state = 'E';
193 } else if (EQUAL(score, F[i][j])) {
194 state = 'F';
195 } else if (EQUAL(score, S[i - 1][j - 1] +
196 PrfEditScore(T1 + 3 * i, T2 + 3 * j, seq1[i - 1], seq2[j - 1]))) {
197 alignment[0][pos] = i;
198 alignment[1][pos--] = j;
199 i--;
200 j--;
201 } else {
202 vrna_message_error("backtrack of alignment failed");
203 }
204
205 break;
206 }
207 }
208
209 for (; j > 0; j--) {
210 alignment[0][pos] = 0;
211 alignment[1][pos--] = j;
212 }
213 for (; i > 0; i--) {
214 alignment[0][pos] = i;
215 alignment[1][pos--] = 0;
216 }
217
218 for (i = pos + 1; i <= length1 + length2; i++) {
219 alignment[0][i - pos] = alignment[0][i];
220 alignment[1][i - pos] = alignment[1][i];
221 }
222 alignment[0][0] = length1 + length2 - pos; /* length of alignment */
223
224 sprint_aligned_bppm(T1, seq1, T2, seq2);
225 free(alignment[0]);
226 free(alignment[1]);
227 }
228
229 for (i = 0; i <= length1; i++) {
230 free(S[i]);
231 free(E[i]);
232 free(F[i]);
233 }
234 free(S);
235 free(E);
236 free(F);
237
238 return tot_score;
239 }
240
241
242 /*---------------------------------------------------------------------------*/
243 PRIVATE inline double
average(double x,double y)244 average(double x,
245 double y)
246 {
247 /*
248 * As in Bonhoeffer et al (1993) 'RNA Multi Structure Landscapes',
249 * Eur. Biophys. J. 22: 13-24 we have chosen the geometric mean.
250 */
251 return (float)sqrt(x * y);
252 }
253
254
255 PRIVATE double
PrfEditScore(const float * p1,const float * p2,char c1,char c2)256 PrfEditScore(const float *p1,
257 const float *p2,
258 char c1,
259 char c2)
260 {
261 double score;
262 int k;
263
264 for (score = 0., k = 0; k < 3; k++)
265 score += average(p1[k], p2[k]);
266
267 score *= (1 - seqw);
268 if (c1 == c2)
269 score += seqw;
270 else if (((c1 == 'A') && (c2 == 'G')) ||
271 ((c1 == 'G') && (c2 == 'A')) ||
272 ((c1 == 'C') && (c2 == 'U')) ||
273 ((c1 == 'U') && (c2 == 'C')))
274 score += 0.5 * seqw;
275 else
276 score -= 0.9 * seqw;
277
278 return score;
279 }
280
281
282 /*---------------------------------------------------------------------------*/
283
284 PRIVATE void
sprint_aligned_bppm(const float * T1,const char * seq1,const float * T2,const char * seq2)285 sprint_aligned_bppm(const float *T1,
286 const char *seq1,
287 const float *T2,
288 const char *seq2)
289 {
290 int i, length;
291
292 length = alignment[0][0];
293 for (i = 0; i < 4; i++) {
294 if (aligned_line[i] != NULL)
295 free(aligned_line[i]);
296
297 aligned_line[i] = (char *)vrna_alloc((length + 1) * sizeof(char));
298 }
299 for (i = 1; i <= length; i++) {
300 if (alignment[0][i] == 0) {
301 aligned_line[0][i - 1] = aligned_line[2][i - 1] = '_';
302 } else {
303 aligned_line[0][i - 1] = vrna_bpp_symbol(T1 + alignment[0][i] * 3);
304 aligned_line[2][i - 1] = seq1[alignment[0][i] - 1];
305 }
306
307 if (alignment[1][i] == 0) {
308 aligned_line[1][i - 1] = aligned_line[3][i - 1] = '_';
309 } else {
310 aligned_line[1][i - 1] = vrna_bpp_symbol(T2 + alignment[1][i] * 3);
311 aligned_line[3][i - 1] = seq2[alignment[1][i] - 1];
312 }
313 }
314 }
315
316
317 PUBLIC int
set_paln_params(double gap_open,double gap_ext,double seq_weight,int freeends)318 set_paln_params(double gap_open,
319 double gap_ext,
320 double seq_weight,
321 int freeends)
322 {
323 open = (gap_open > 0) ? -gap_open : gap_open;
324 ext = (gap_ext > 0) ? -gap_ext : gap_ext;
325 if (open > ext)
326 vrna_message_warning("Gap extension penalty is smaller than "
327 "gap open. Do you realy want this?");
328
329 seqw = seq_weight;
330 if (seqw < 0) {
331 seqw = 0;
332 vrna_message_warning("Sequence weight set to 0 (must be in [0..1])");
333 } else
334 if (seqw > 1) {
335 seqw = 1;
336 vrna_message_warning("Sequence weight set to 1 (must be in [0..1])");
337 }
338
339 free_ends = (freeends) ? 1 : 0;
340 return 0;
341 }
342
343
344 /*---------------------------------------------------------------------------*/
345