1 /*
2 * String alignment for RNA secondary structures
3 * Peter F Stadler, Ivo Hofacker
4 * Vienna RNA Package
5 */
6
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <math.h>
16 #include "ViennaRNA/edit_cost.h"
17 #include "ViennaRNA/dist_vars.h"
18 #include "ViennaRNA/utils/basic.h"
19
20 PUBLIC float
21 string_edit_distance(swString *T1,
22 swString *T2);
23
24
25 PUBLIC swString *
26 Make_swString(char *string);
27
28
29 PUBLIC void
30 print_swString(swString *x);
31
32
33 PUBLIC void
34 print_alignment_list(void);
35
36
37 PRIVATE void
38 sprint_aligned_swStrings(swString *T1,
39 swString *T2);
40
41
42 PRIVATE float
43 StrEditCost(int i,
44 int j,
45 swString *T1,
46 swString *T2);
47
48
49 PRIVATE void
50 DeCode(char *string,
51 int k,
52 int *tp,
53 float *w);
54
55
56 PRIVATE int
57 decode(char *id);
58
59
60 PRIVATE void
61 encode(int type,
62 char label[]);
63
64
65 PRIVATE int *alignment[2]; /* contains information from backtracking
66 * alignment[0][n] is the node in tree2
67 * matching node n in tree1 */
68
69
70 /*---------------------------------------------------------------------------*/
71
72 PUBLIC float
string_edit_distance(swString * T1,swString * T2)73 string_edit_distance(swString *T1,
74 swString *T2)
75
76 {
77 float **distance;
78 short **i_point, **j_point;
79
80 int i, j, i1, j1, pos, length1, length2;
81 float minus, plus, change, temp;
82
83 if (cost_matrix == 0)
84 EditCost = &UsualCost;
85 else
86 EditCost = &ShapiroCost;
87
88 i_point = NULL;
89 j_point = NULL;
90 length1 = T1[0].sign;
91 length2 = T2[0].sign;
92
93 distance = (float **)vrna_alloc((length1 + 1) * sizeof(float *));
94 if (edit_backtrack) {
95 i_point = (short **)vrna_alloc((length1 + 1) * sizeof(short *));
96 j_point = (short **)vrna_alloc((length1 + 1) * sizeof(short *));
97 }
98
99 for (i = 0; i <= length1; i++) {
100 distance[i] = (float *)vrna_alloc((length2 + 1) * sizeof(float));
101 if (edit_backtrack) {
102 i_point[i] = (short *)vrna_alloc((length2 + 1) * sizeof(short));
103 j_point[i] = (short *)vrna_alloc((length2 + 1) * sizeof(short));
104 }
105 }
106
107 for (i = 1; i <= length1; i++) {
108 if (edit_backtrack) {
109 i_point[i][0] = i - 1;
110 j_point[i][0] = 0;
111 }
112
113 distance[i][0] = distance[i - 1][0] + StrEditCost(i, 0, T1, T2);
114 }
115 for (j = 1; j <= length2; j++) {
116 if (edit_backtrack) {
117 j_point[0][j] = j - 1;
118 i_point[0][j] = 0;
119 }
120
121 distance[0][j] = distance[0][j - 1] + StrEditCost(0, j, T1, T2);
122 }
123
124 for (i = 1; i <= length1; i++) {
125 for (j = 1; j <= length2; j++) {
126 minus = distance[i - 1][j] + StrEditCost(i, 0, T1, T2);
127 plus = distance[i][j - 1] + StrEditCost(0, j, T1, T2);
128 change = distance[i - 1][j - 1] + StrEditCost(i, j, T1, T2);
129
130 distance[i][j] = MIN3(minus, plus, change);
131 /* printf("%g ", distance[i][j]); */
132
133 if (edit_backtrack) {
134 if (distance[i][j] == change) {
135 i_point[i][j] = i - 1;
136 j_point[i][j] = j - 1;
137 } else if (distance[i][j] == plus) {
138 i_point[i][j] = i;
139 j_point[i][j] = j - 1;
140 } else {
141 i_point[i][j] = i - 1;
142 j_point[i][j] = j;
143 }
144 }
145 }
146 /* printf("\n"); */
147 }
148 /* printf("\n"); */
149 temp = distance[length1][length2];
150 for (i = 0; i <= length1; i++)
151 free(distance[i]);
152 free(distance);
153
154 if (edit_backtrack) {
155 if (alignment[0] != NULL)
156 free(alignment[0]);
157
158 if (alignment[1] != NULL)
159 free(alignment[1]);
160
161 alignment[0] = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
162 alignment[1] = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
163
164 pos = length1 + length2;
165 i = length1;
166 j = length2;
167 while ((i > 0) || (j > 0)) {
168 i1 = i_point[i][j];
169 j1 = j_point[i][j];
170 if (((i - i1) == 1) && ((j - j1) == 1)) {
171 /* substitution */
172 alignment[0][pos] = i;
173 alignment[1][pos] = j;
174 }
175
176 if (((i - i1) == 1) && (j == j1)) {
177 /* Deletion in [1] */
178 alignment[0][pos] = i;
179 alignment[1][pos] = 0;
180 }
181
182 if ((i == i1) && ((j - j1) == 1)) {
183 /* Deletion in [0] */
184 alignment[0][pos] = 0;
185 alignment[1][pos] = j;
186 }
187
188 pos--;
189 i = i1;
190 j = j1;
191 }
192 for (i = pos + 1; i <= length1 + length2; i++) {
193 alignment[0][i - pos] = alignment[0][i];
194 alignment[1][i - pos] = alignment[1][i];
195 }
196 alignment[0][0] = length1 + length2 - pos; /* length of alignment */
197
198 for (i = 0; i <= length1; i++) {
199 free(i_point[i]);
200 free(j_point[i]);
201 }
202 free(i_point);
203 free(j_point);
204 sprint_aligned_swStrings(T1, T2);
205 }
206
207 return temp;
208 }
209
210
211 /*---------------------------------------------------------------------------*/
212
213 PRIVATE float
StrEditCost(int i,int j,swString * T1,swString * T2)214 StrEditCost(int i,
215 int j,
216 swString *T1,
217 swString *T2)
218 {
219 float c, diff, cd, min, a, b, dist;
220
221 if (i == 0) {
222 cd = (float)(*EditCost)[0][T2[j].type];
223 diff = T2[j].weight;
224 dist = cd * diff;
225 } else
226 if (j == 0) {
227 cd = (float)(*EditCost)[T1[i].type][0];
228 diff = T1[i].weight;
229 dist = cd * diff;
230 } else
231 if (((T1[i].sign) * (T2[j].sign)) > 0) {
232 c = (float)(*EditCost)[T1[i].type][T2[j].type];
233 diff = (float)fabs((a = T1[i].weight) - (b = T2[j].weight));
234 min = MIN2(a, b);
235 if (min == a)
236 cd = (float)(*EditCost)[0][T2[j].type];
237 else
238 cd = (float)(*EditCost)[T1[i].type][0];
239
240 dist = c * min + cd * diff;
241 } else {
242 dist = (float)DIST_INF;
243 }
244
245 return dist;
246 }
247
248
249 /*---------------------------------------------------------------------------*/
250
251 PUBLIC swString *
Make_swString(char * string)252 Make_swString(char *string)
253 {
254 int i = 0, j = 0, k = 0;
255 int tp, len, l, length;
256 float w;
257 swString *S;
258
259 length = strlen(string);
260
261 for (i = 0; i < length; i++) {
262 if ((string[i] == '(') || (string[i] == ')'))
263 j++;
264
265 if (string[i] == '.')
266 j += 2;
267 }
268
269 len = j;
270
271 S = (swString *)vrna_alloc(sizeof(swString) * (len + 1));
272 S[0].sign = j; /* number of entries */
273 S[0].weight = 0.0;
274 S[0].type = 0;
275
276 i = 0;
277 j = 1;
278 while (i < length) {
279 switch (string[i]) {
280 case '(':
281 S[j].sign = 1;
282 l = 1;
283 k = i;
284 while (l > 0) {
285 k++;
286 if (string[k] == '(')
287 l++;
288
289 if (string[k] == ')')
290 l--;
291 }
292 DeCode(string, k, &tp, &w);
293 S[j].type = tp;
294 S[j].weight = w / 2.0;
295 j++;
296 break;
297 case ')':
298 k = i;
299 S[j].sign = -1;
300 DeCode(string, k, &tp, &w);
301 S[j].type = tp;
302 S[j].weight = w / 2.0;
303 j++;
304 break;
305 case '.':
306 S[j].sign = 1;
307 S[j].type = 1;
308 S[j].weight = 0.5;
309 j++;
310 S[j].sign = -1;
311 S[j].type = 1;
312 S[j].weight = 0.5;
313 j++;
314 break;
315 }
316 i++;
317 }
318 return S;
319 }
320
321
322 /*---------------------------------------------------------------------------*/
323
324 PRIVATE void
DeCode(char * string,int k,int * tp,float * w)325 DeCode(char *string,
326 int k,
327 int *tp,
328 float *w)
329 /* retrieves type and weigth for a node closed by a bracket at position k */
330 {
331 int i, j, l, m;
332 char label[20], id[20];
333
334 i = k;
335 label[0] = '\0';
336 while (i >= 0) {
337 i--;
338 if ((string[i] == '(') || (string[i] == ')') || (string[i] == '.')) {
339 break;
340 } else {
341 label[k - i - 1] = string[i];
342 label[k - i] = '\0';
343 }
344 }
345 l = strlen(label);
346 if (l == 0) {
347 /* Dot-Bracket notation */
348 *w = 1.0;
349 *tp = 2;
350 } else {
351 for (i = 0; i < l; i++) {
352 if (!isalpha(label[l - i - 1]))
353 break;
354
355 id[i] = label[l - i - 1];
356 }
357 id[i] = '\0';
358 *tp = decode(id);
359 l = l - i - 1;
360 if (l >= 0) {
361 for (j = 0; j <= l; j++)
362 id[j] = label[l - j];
363 label[l + 1] = '\0';
364 m = -1;
365 sscanf(label, "%d", &m);
366 *w = (float)m;
367 if (m == -1) {
368 vrna_message_warning("Non-integer weight in DeCode ignored");
369 *w = 1.0;
370 }
371 } else {
372 *w = 1.0;
373 }
374 }
375 }
376
377
378 /*---------------------------------------------------------------------------*/
379
380 PRIVATE int
decode(char * id)381 decode(char *id)
382 {
383 int n, quit, i;
384 char label[100], *code;
385
386 n = 0;
387
388 quit = 0;
389 code = coding;
390
391 while (!quit) {
392 for (i = 0; code[i] != sep; i++) {
393 if (code[i] == '\0') {
394 quit = 1;
395 break;
396 }
397
398 label[i] = code[i];
399 }
400 label[i] = '\0';
401 if (strcmp(id, label) == 0)
402 return n;
403
404 code += (i + 1);
405 n++;
406 }
407
408 vrna_message_error("Syntax error: node identifier \"%s\" not found "
409 "in coding string \"%s\"\n"
410 "Exiting",
411 id, coding);
412 exit(0);
413 }
414
415
416 /*---------------------------------------------------------------------------*/
417
418 PRIVATE void
encode(int type,char label[])419 encode(int type,
420 char label[])
421
422 {
423 int i, l;
424
425 l = 0;
426 for (i = 0; i < type; i++) {
427 while (coding[l] != sep && coding[l])
428 l++;
429 l++;
430 }
431
432 for (i = 0; coding[l + i] != sep; i++) {
433 if (coding[l + i] == '\0')
434 break;
435
436 label[i] = coding[l + i];
437 }
438 label[i] = '\0';
439 }
440
441
442 /*---------------------------------------------------------------------------*/
443
444 PRIVATE void
sprint_aligned_swStrings(swString * T1,swString * T2)445 sprint_aligned_swStrings(swString *T1,
446 swString *T2)
447 {
448 int i, j, l0, l1, ltmp = 0, weights;
449 char label[10], *a0, *a1, tmp0[20], tmp1[20];
450
451 weights = 0;
452 for (i = 1; i <= T1[0].sign; i++)
453 weights = (weights || (T1[i].weight != 0.5));
454 for (i = 1; i <= T2[0].sign; i++)
455 weights = (weights || (T2[i].weight != 0.5));
456
457 a0 = (char *)vrna_alloc(alignment[0][0] * 4 + 2);
458 a1 = (char *)vrna_alloc(alignment[0][0] * 4 + 2);
459 for (i = 1; i <= alignment[0][0]; i++) {
460 tmp0[0] = '\0';
461 l0 = 0;
462 if (alignment[0][i] > 0) {
463 encode(T1[alignment[0][i]].type, label);
464 if (T1[alignment[0][i]].sign > 0) {
465 tmp0[0] = '(';
466 tmp0[1] = '\0';
467 }
468
469 strcat(tmp0, label);
470 if (weights)
471 sprintf(tmp0 + strlen(tmp0), "%d",
472 (int)(2 * T1[alignment[0][i]].weight));
473
474 if (T1[alignment[0][i]].sign < 0)
475 strcat(tmp0, ")");
476
477 l0 = strlen(tmp0);
478 }
479
480 tmp1[0] = '\0';
481 l1 = 0;
482 if (alignment[1][i] > 0) {
483 encode(T2[alignment[1][i]].type, label);
484 if (T2[alignment[1][i]].sign > 0) {
485 tmp1[0] = '(';
486 tmp1[1] = '\0';
487 }
488
489 strcat(tmp1, label);
490 if (weights)
491 sprintf(tmp1 + strlen(tmp1), "%d",
492 (int)(2 * T2[alignment[1][i]].weight));
493
494 if (T2[alignment[1][i]].sign < 0)
495 strcat(tmp1, ")");
496
497 l1 = strlen(tmp1);
498 }
499
500 ltmp = MAX2(l0, l1);
501 for (j = l0; j < ltmp; j++)
502 tmp0[j] = '_';
503 for (j = l1; j < ltmp; j++)
504 tmp1[j] = '_';
505 tmp0[ltmp] = '\0';
506 tmp1[ltmp] = '\0';
507
508 strcat(a0, tmp0);
509 strcat(a1, tmp1);
510 ltmp = strlen(a0);
511 }
512 if (aligned_line[0] != NULL) {
513 free(aligned_line[0]);
514 aligned_line[0] = NULL;
515 }
516
517 if (aligned_line[1] != NULL) {
518 free(aligned_line[1]);
519 aligned_line[1] = NULL;
520 }
521
522 aligned_line[0] = strdup(a0);
523 free(a0);
524 aligned_line[1] = strdup(a1);
525 free(a1);
526 }
527
528
529 /*---------------------------------------------------------------------------*/
530
531
532 PUBLIC void
print_swString(swString * x)533 print_swString(swString *x)
534 {
535 int i;
536
537 for (i = 0; i <= x[0].sign; i++)
538 printf("(%d,%d,%f\n) ", x[i].type, x[i].sign, x[i].weight);
539 printf("\n");
540 }
541
542
543 /*---------------------------------------------------------------------------*/
544
545 PUBLIC void
print_alignment_list(void)546 print_alignment_list(void)
547 {
548 int i;
549
550 printf("\n");
551 for (i = 1; i <= alignment[0][0]; i++)
552 printf("%3d ", alignment[0][i]);
553 printf("\n");
554 for (i = 1; i <= alignment[0][0]; i++)
555 printf("%3d ", alignment[1][i]);
556 printf("\n");
557 }
558