1 /*
2  *  String alignment for RNA secondary structures
3  *        Peter F Stadler, Ivo Hofacker
4  *       Vienna RNA Package
5  */
6 
7 #ifdef HAVE_CONFIG_H
8 #include "config.h"
9 #endif
10 
11 #include <stdio.h>
12 #include <stdlib.h>
13 #include <string.h>
14 #include <ctype.h>
15 #include <math.h>
16 #include "ViennaRNA/edit_cost.h"
17 #include "ViennaRNA/dist_vars.h"
18 #include "ViennaRNA/utils/basic.h"
19 
20 PUBLIC float
21 string_edit_distance(swString *T1,
22                      swString *T2);
23 
24 
25 PUBLIC swString *
26 Make_swString(char *string);
27 
28 
29 PUBLIC void
30 print_swString(swString *x);
31 
32 
33 PUBLIC void
34 print_alignment_list(void);
35 
36 
37 PRIVATE void
38 sprint_aligned_swStrings(swString *T1,
39                          swString *T2);
40 
41 
42 PRIVATE float
43 StrEditCost(int       i,
44             int       j,
45             swString  *T1,
46             swString  *T2);
47 
48 
49 PRIVATE void
50 DeCode(char   *string,
51        int    k,
52        int    *tp,
53        float  *w);
54 
55 
56 PRIVATE int
57 decode(char *id);
58 
59 
60 PRIVATE void
61 encode(int  type,
62        char label[]);
63 
64 
65 PRIVATE int *alignment[2];       /* contains information from backtracking
66                                   *  alignment[0][n] is the node in tree2
67                                   *  matching node n in tree1               */
68 
69 
70 /*---------------------------------------------------------------------------*/
71 
72 PUBLIC float
string_edit_distance(swString * T1,swString * T2)73 string_edit_distance(swString *T1,
74                      swString *T2)
75 
76 {
77   float **distance;
78   short **i_point, **j_point;
79 
80   int   i, j, i1, j1, pos, length1, length2;
81   float minus, plus, change, temp;
82 
83   if (cost_matrix == 0)
84     EditCost = &UsualCost;
85   else
86     EditCost = &ShapiroCost;
87 
88   i_point = NULL;
89   j_point = NULL;
90   length1 = T1[0].sign;
91   length2 = T2[0].sign;
92 
93   distance = (float **)vrna_alloc((length1 + 1) * sizeof(float *));
94   if (edit_backtrack) {
95     i_point = (short **)vrna_alloc((length1 + 1) * sizeof(short *));
96     j_point = (short **)vrna_alloc((length1 + 1) * sizeof(short *));
97   }
98 
99   for (i = 0; i <= length1; i++) {
100     distance[i] = (float *)vrna_alloc((length2 + 1) * sizeof(float));
101     if (edit_backtrack) {
102       i_point[i]  = (short *)vrna_alloc((length2 + 1) * sizeof(short));
103       j_point[i]  = (short *)vrna_alloc((length2 + 1) * sizeof(short));
104     }
105   }
106 
107   for (i = 1; i <= length1; i++) {
108     if (edit_backtrack) {
109       i_point[i][0] = i - 1;
110       j_point[i][0] = 0;
111     }
112 
113     distance[i][0] = distance[i - 1][0] + StrEditCost(i, 0, T1, T2);
114   }
115   for (j = 1; j <= length2; j++) {
116     if (edit_backtrack) {
117       j_point[0][j] = j - 1;
118       i_point[0][j] = 0;
119     }
120 
121     distance[0][j] = distance[0][j - 1] + StrEditCost(0, j, T1, T2);
122   }
123 
124   for (i = 1; i <= length1; i++) {
125     for (j = 1; j <= length2; j++) {
126       minus   = distance[i - 1][j] + StrEditCost(i, 0, T1, T2);
127       plus    = distance[i][j - 1] + StrEditCost(0, j, T1, T2);
128       change  = distance[i - 1][j - 1] + StrEditCost(i, j, T1, T2);
129 
130       distance[i][j] = MIN3(minus, plus, change);
131       /* printf("%g ", distance[i][j]); */
132 
133       if (edit_backtrack) {
134         if (distance[i][j] == change) {
135           i_point[i][j] = i - 1;
136           j_point[i][j] = j - 1;
137         } else if (distance[i][j] == plus) {
138           i_point[i][j] = i;
139           j_point[i][j] = j - 1;
140         } else {
141           i_point[i][j] = i - 1;
142           j_point[i][j] = j;
143         }
144       }
145     }
146     /* printf("\n"); */
147   }
148   /* printf("\n"); */
149   temp = distance[length1][length2];
150   for (i = 0; i <= length1; i++)
151     free(distance[i]);
152   free(distance);
153 
154   if (edit_backtrack) {
155     if (alignment[0] != NULL)
156       free(alignment[0]);
157 
158     if (alignment[1] != NULL)
159       free(alignment[1]);
160 
161     alignment[0]  = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
162     alignment[1]  = (int *)vrna_alloc((length1 + length2 + 1) * sizeof(int));
163 
164     pos = length1 + length2;
165     i   = length1;
166     j   = length2;
167     while ((i > 0) || (j > 0)) {
168       i1  = i_point[i][j];
169       j1  = j_point[i][j];
170       if (((i - i1) == 1) && ((j - j1) == 1)) {
171         /* substitution    */
172         alignment[0][pos] = i;
173         alignment[1][pos] = j;
174       }
175 
176       if (((i - i1) == 1) && (j == j1)) {
177         /* Deletion in [1] */
178         alignment[0][pos] = i;
179         alignment[1][pos] = 0;
180       }
181 
182       if ((i == i1) && ((j - j1) == 1)) {
183         /* Deletion in [0] */
184         alignment[0][pos] = 0;
185         alignment[1][pos] = j;
186       }
187 
188       pos--;
189       i = i1;
190       j = j1;
191     }
192     for (i = pos + 1; i <= length1 + length2; i++) {
193       alignment[0][i - pos] = alignment[0][i];
194       alignment[1][i - pos] = alignment[1][i];
195     }
196     alignment[0][0] = length1 + length2 - pos;  /* length of alignment */
197 
198     for (i = 0; i <= length1; i++) {
199       free(i_point[i]);
200       free(j_point[i]);
201     }
202     free(i_point);
203     free(j_point);
204     sprint_aligned_swStrings(T1, T2);
205   }
206 
207   return temp;
208 }
209 
210 
211 /*---------------------------------------------------------------------------*/
212 
213 PRIVATE float
StrEditCost(int i,int j,swString * T1,swString * T2)214 StrEditCost(int       i,
215             int       j,
216             swString  *T1,
217             swString  *T2)
218 {
219   float c, diff, cd, min, a, b, dist;
220 
221   if (i == 0) {
222     cd    = (float)(*EditCost)[0][T2[j].type];
223     diff  = T2[j].weight;
224     dist  = cd * diff;
225   } else
226   if (j == 0) {
227     cd    = (float)(*EditCost)[T1[i].type][0];
228     diff  = T1[i].weight;
229     dist  = cd * diff;
230   } else
231   if (((T1[i].sign) * (T2[j].sign)) > 0) {
232     c     = (float)(*EditCost)[T1[i].type][T2[j].type];
233     diff  = (float)fabs((a = T1[i].weight) - (b = T2[j].weight));
234     min   = MIN2(a, b);
235     if (min == a)
236       cd = (float)(*EditCost)[0][T2[j].type];
237     else
238       cd = (float)(*EditCost)[T1[i].type][0];
239 
240     dist = c * min + cd * diff;
241   } else {
242     dist = (float)DIST_INF;
243   }
244 
245   return dist;
246 }
247 
248 
249 /*---------------------------------------------------------------------------*/
250 
251 PUBLIC swString *
Make_swString(char * string)252 Make_swString(char *string)
253 {
254   int       i = 0, j = 0, k = 0;
255   int       tp, len, l, length;
256   float     w;
257   swString  *S;
258 
259   length = strlen(string);
260 
261   for (i = 0; i < length; i++) {
262     if ((string[i] == '(') || (string[i] == ')'))
263       j++;
264 
265     if (string[i] == '.')
266       j += 2;
267   }
268 
269   len = j;
270 
271   S           = (swString *)vrna_alloc(sizeof(swString) * (len + 1));
272   S[0].sign   = j; /* number of entries */
273   S[0].weight = 0.0;
274   S[0].type   = 0;
275 
276   i = 0;
277   j = 1;
278   while (i < length) {
279     switch (string[i]) {
280       case '(':
281         S[j].sign = 1;
282         l         = 1;
283         k         = i;
284         while (l > 0) {
285           k++;
286           if (string[k] == '(')
287             l++;
288 
289           if (string[k] == ')')
290             l--;
291         }
292         DeCode(string, k, &tp, &w);
293         S[j].type   = tp;
294         S[j].weight = w / 2.0;
295         j++;
296         break;
297       case ')':
298         k         = i;
299         S[j].sign = -1;
300         DeCode(string, k, &tp, &w);
301         S[j].type   = tp;
302         S[j].weight = w / 2.0;
303         j++;
304         break;
305       case '.':
306         S[j].sign   = 1;
307         S[j].type   = 1;
308         S[j].weight = 0.5;
309         j++;
310         S[j].sign   = -1;
311         S[j].type   = 1;
312         S[j].weight = 0.5;
313         j++;
314         break;
315     }
316     i++;
317   }
318   return S;
319 }
320 
321 
322 /*---------------------------------------------------------------------------*/
323 
324 PRIVATE void
DeCode(char * string,int k,int * tp,float * w)325 DeCode(char   *string,
326        int    k,
327        int    *tp,
328        float  *w)
329 /* retrieves type and weigth for a node closed  by a bracket at position k */
330 {
331   int   i, j, l, m;
332   char  label[20], id[20];
333 
334   i         = k;
335   label[0]  = '\0';
336   while (i >= 0) {
337     i--;
338     if ((string[i] == '(') || (string[i] == ')') || (string[i] == '.')) {
339       break;
340     } else {
341       label[k - i - 1]  = string[i];
342       label[k - i]      = '\0';
343     }
344   }
345   l = strlen(label);
346   if (l == 0) {
347     /* Dot-Bracket notation */
348     *w  = 1.0;
349     *tp = 2;
350   } else {
351     for (i = 0; i < l; i++) {
352       if (!isalpha(label[l - i - 1]))
353         break;
354 
355       id[i] = label[l - i - 1];
356     }
357     id[i] = '\0';
358     *tp   = decode(id);
359     l     = l - i - 1;
360     if (l >= 0) {
361       for (j = 0; j <= l; j++)
362         id[j] = label[l - j];
363       label[l + 1]  = '\0';
364       m             = -1;
365       sscanf(label, "%d", &m);
366       *w = (float)m;
367       if (m == -1) {
368         vrna_message_warning("Non-integer weight in DeCode ignored");
369         *w = 1.0;
370       }
371     } else {
372       *w = 1.0;
373     }
374   }
375 }
376 
377 
378 /*---------------------------------------------------------------------------*/
379 
380 PRIVATE int
decode(char * id)381 decode(char *id)
382 {
383   int   n, quit, i;
384   char  label[100], *code;
385 
386   n = 0;
387 
388   quit  = 0;
389   code  = coding;
390 
391   while (!quit) {
392     for (i = 0; code[i] != sep; i++) {
393       if (code[i] == '\0') {
394         quit = 1;
395         break;
396       }
397 
398       label[i] = code[i];
399     }
400     label[i] = '\0';
401     if (strcmp(id, label) == 0)
402       return n;
403 
404     code += (i + 1);
405     n++;
406   }
407 
408   vrna_message_error("Syntax error: node identifier \"%s\" not found "
409                      "in coding string \"%s\"\n"
410                      "Exiting",
411                      id, coding);
412   exit(0);
413 }
414 
415 
416 /*---------------------------------------------------------------------------*/
417 
418 PRIVATE void
encode(int type,char label[])419 encode(int  type,
420        char label[])
421 
422 {
423   int i, l;
424 
425   l = 0;
426   for (i = 0; i < type; i++) {
427     while (coding[l] != sep && coding[l])
428       l++;
429     l++;
430   }
431 
432   for (i = 0; coding[l + i] != sep; i++) {
433     if (coding[l + i] == '\0')
434       break;
435 
436     label[i] = coding[l + i];
437   }
438   label[i] = '\0';
439 }
440 
441 
442 /*---------------------------------------------------------------------------*/
443 
444 PRIVATE void
sprint_aligned_swStrings(swString * T1,swString * T2)445 sprint_aligned_swStrings(swString *T1,
446                          swString *T2)
447 {
448   int   i, j, l0, l1, ltmp = 0, weights;
449   char  label[10], *a0, *a1, tmp0[20], tmp1[20];
450 
451   weights = 0;
452   for (i = 1; i <= T1[0].sign; i++)
453     weights = (weights || (T1[i].weight != 0.5));
454   for (i = 1; i <= T2[0].sign; i++)
455     weights = (weights || (T2[i].weight != 0.5));
456 
457   a0  = (char *)vrna_alloc(alignment[0][0] * 4 + 2);
458   a1  = (char *)vrna_alloc(alignment[0][0] * 4 + 2);
459   for (i = 1; i <= alignment[0][0]; i++) {
460     tmp0[0] = '\0';
461     l0      = 0;
462     if (alignment[0][i] > 0) {
463       encode(T1[alignment[0][i]].type, label);
464       if (T1[alignment[0][i]].sign > 0) {
465         tmp0[0] = '(';
466         tmp0[1] = '\0';
467       }
468 
469       strcat(tmp0, label);
470       if (weights)
471         sprintf(tmp0 + strlen(tmp0), "%d",
472                 (int)(2 * T1[alignment[0][i]].weight));
473 
474       if (T1[alignment[0][i]].sign < 0)
475         strcat(tmp0, ")");
476 
477       l0 = strlen(tmp0);
478     }
479 
480     tmp1[0] = '\0';
481     l1      = 0;
482     if (alignment[1][i] > 0) {
483       encode(T2[alignment[1][i]].type, label);
484       if (T2[alignment[1][i]].sign > 0) {
485         tmp1[0] = '(';
486         tmp1[1] = '\0';
487       }
488 
489       strcat(tmp1, label);
490       if (weights)
491         sprintf(tmp1 + strlen(tmp1), "%d",
492                 (int)(2 * T2[alignment[1][i]].weight));
493 
494       if (T2[alignment[1][i]].sign < 0)
495         strcat(tmp1, ")");
496 
497       l1 = strlen(tmp1);
498     }
499 
500     ltmp = MAX2(l0, l1);
501     for (j = l0; j < ltmp; j++)
502       tmp0[j] = '_';
503     for (j = l1; j < ltmp; j++)
504       tmp1[j] = '_';
505     tmp0[ltmp]  = '\0';
506     tmp1[ltmp]  = '\0';
507 
508     strcat(a0, tmp0);
509     strcat(a1, tmp1);
510     ltmp = strlen(a0);
511   }
512   if (aligned_line[0] != NULL) {
513     free(aligned_line[0]);
514     aligned_line[0] = NULL;
515   }
516 
517   if (aligned_line[1] != NULL) {
518     free(aligned_line[1]);
519     aligned_line[1] = NULL;
520   }
521 
522   aligned_line[0] = strdup(a0);
523   free(a0);
524   aligned_line[1] = strdup(a1);
525   free(a1);
526 }
527 
528 
529 /*---------------------------------------------------------------------------*/
530 
531 
532 PUBLIC void
print_swString(swString * x)533 print_swString(swString *x)
534 {
535   int i;
536 
537   for (i = 0; i <= x[0].sign; i++)
538     printf("(%d,%d,%f\n) ", x[i].type, x[i].sign, x[i].weight);
539   printf("\n");
540 }
541 
542 
543 /*---------------------------------------------------------------------------*/
544 
545 PUBLIC void
print_alignment_list(void)546 print_alignment_list(void)
547 {
548   int i;
549 
550   printf("\n");
551   for (i = 1; i <= alignment[0][0]; i++)
552     printf("%3d ", alignment[0][i]);
553   printf("\n");
554   for (i = 1; i <= alignment[0][0]; i++)
555     printf("%3d ", alignment[1][i]);
556   printf("\n");
557 }
558