1 #include <U2Core/disable-warnings.h>
2 U2_DISABLE_WARNINGS
3 
4 /*****************************************************************
5 * HMMER - Biological sequence analysis with profile HMMs
6 * Copyright (C) 1992-2003 Washington University School of Medicine
7 * All Rights Reserved
8 *
9 *     This source code is distributed under the terms of the
10 *     GNU General Public License. See the files COPYING and LICENSE
11 *     for details.
12 *****************************************************************/
13 
14 /* aligneval.c
15 *
16 * Comparison of multiple alignments. Three functions are
17 * provided, using subtly different scoring schemes:
18 *    CompareMultAlignments()    - basic scoring scheme
19 *    CompareRefMultAlignments() - only certain "canonical" columns
20 *                                 are scored
21 *
22 * The similarity measure is a fractional alignment identity averaged
23 * over all sequence pairs. The score for all pairs is:
24 *      (identically aligned symbols) / (total aligned columns in
25 *      known alignment)
26 *
27 * A column c is identically aligned for sequences i, j if:
28 *    1) both i,j have a symbol aligned in column c, and the
29 *       same pair of symbols is aligned somewhere in the test
30 *       alignment
31 *    2) S[i][c] is aligned to a gap in sequence j, and that symbol
32 *       is aligned to a gap in the test alignment
33 *    3) converse of 2)
34 *
35 *
36 * The algorithm is as follows:
37 *    1) For each known/test aligned pair of sequences (k1,k2 and t1,t2)
38 *        construct a list for each sequence, in which for every
39 *        counted symbol we record the raw index of the symbol in
40 *        the other sequence that it aligns to, or -1 if it aligns
41 *        to a gap or uncounted symbol.
42 *
43 *    2)  Compare the list for k1 to the list for t1 and count an identity
44 *        for each correct alignment.
45 *
46 *    3) Repeat 2) for comparing k2 to t2. Note that this means correct sym/sym
47 *       alignments count for 2; correct sym/gap alignments count for 1.
48 *
49 *    4) The score is (identities from 2 + identities from 3) /
50 *       (totals from 2 + totals from 3).
51 *
52 * Written originally for koala's ss2 pairwise alignment package.
53 *
54 * Sean Eddy, Sun Nov  1 12:45:11 1992
55 * SRE, Thu Jul 29 16:47:18 1993: major revision: all functions replaced by new algorithm
56 * CVS $Id: aligneval.c,v 1.9 2003/04/14 16:00:16 eddy Exp $
57 */
58 
59 #include "funcs.h"
60 
61 static int make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen);
62 static int make_ref_alilist(int *refcoords, char *k1, char *k2, char *s1, char *s2,
63                             int **ret_s1_list, int *ret_listlen);
64 static int compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc);
65 
66 
67 /* Function: ComparePairAlignments
68 *
69 * Purpose:  Calculate and return a number representing how well two different alignments
70 *           of a pair of sequences compare. The number is, roughly speaking,
71 *           the fraction of columns which are identically aligned.
72 *
73 *           For all columns c in which either known1[c] or known2[c]
74 *           is a non-gap, count an identity if those same symbols are
75 *           aligned somewhere in calc1/calc2. The score is identities/total
76 *           columns examined. (i.e. fully gapped columns don't count)
77 *
78 *           more explicitly, identities come from:
79 *             both known and test aligned pairs have the same symbol in the first sequence aligned to
80 *               a gap in the second sequence;
81 *             both known and test aligned pairs have the same symbol in the second sequence
82 *               aligned to a gap in the first sequence;
83 *             the known alignment has symbols aligned at this column, and the test
84 *               alignment aligns the same two symbols.
85 *
86 * Args:     known1, known2: trusted alignment of two sequences
87 *           calc1, calc2:   test alignment of two sequences
88 *
89 * Return:   Returns -1.0 on internal failure.
90 */
91 float
ComparePairAlignments(char * known1,char * known2,char * calc1,char * calc2)92 ComparePairAlignments(char *known1, char *known2, char *calc1, char *calc2)
93 {
94     int *klist1;
95     int *klist2;
96     int *tlist1;
97     int *tlist2;
98     int len1, len2;
99     float score;
100 
101     if (! make_alilist(calc1,  calc2,  &tlist1, &len1)) return -1.0;
102     if (! make_alilist(calc2,  calc1,  &tlist2, &len2)) return -1.0;
103     if (! make_alilist(known1, known2, &klist1, &len1)) return -1.0;
104     if (! make_alilist(known2, known1, &klist2, &len2)) return -1.0;
105     if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
106 
107     free(klist1);
108     free(klist2);
109     free(tlist1);
110     free(tlist2);
111     return score;
112 }
113 
114 
115 
116 /* Function: CompareRefPairAlignments()
117 *
118 * Same as above, but the only columns that count are the ones
119 * with indices in *refcoord. *refcoord and the known1, known2
120 * pair must be in sync with each other (come from the same
121 * multiple sequence alignment)
122 *
123 * Args:     ref           - 0..alen-1 array of 1 or 0
124 *           known1,known2 - trusted alignment
125 *           calc1, calc2  - test alignment
126 *
127 * Return:  the fractional alignment identity on success, -1.0 on failure.
128 */
129 float
CompareRefPairAlignments(int * ref,char * known1,char * known2,char * calc1,char * calc2)130 CompareRefPairAlignments(int  *ref, char *known1, char *known2, char *calc1, char *calc2)
131 {
132     int *klist1;
133     int *klist2;
134     int *tlist1;
135     int *tlist2;
136     int len1, len2;
137     float score;
138 
139     if (! make_ref_alilist(ref, known1, known2, calc1,  calc2,  &tlist1, &len1)) return -1.0;
140     if (! make_ref_alilist(ref, known2, known1, calc2,  calc1,  &tlist2, &len2)) return -1.0;
141     if (! make_ref_alilist(ref, known1, known2, known1, known2, &klist1, &len1)) return -1.0;
142     if (! make_ref_alilist(ref, known2, known1, known2, known1, &klist2, &len2)) return -1.0;
143     if (! compare_lists(klist1, klist2, tlist1, tlist2, len1, len2, &score)) return -1.0;
144 
145     free(klist1);
146     free(klist2);
147     free(tlist1);
148     free(tlist2);
149     return score;
150 }
151 
152 /* Function: make_alilist()
153 *
154 * Purpose:  Construct a list (array) mapping the raw symbols of s1
155 *           onto the indexes of the aligned symbols in s2 (or -1
156 *           for gaps in s2). The list (s1_list) will be of the
157 *           length of s1's raw sequence.
158 *
159 * Args:     s1          - sequence to construct the list for
160 *           s2          - sequence s1 is aligned to
161 *           ret_s1_list - RETURN: the constructed list (caller must free)
162 *           ret_listlen - RETURN: length of the list
163 *
164 * Returns:  1 on success, 0 on failure
165 */
166 static int
make_alilist(char * s1,char * s2,int ** ret_s1_list,int * ret_listlen)167 make_alilist(char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
168 {
169     int *s1_list;
170     int  col;         /* column position in alignment */
171     int  r1, r2;          /* raw symbol index at current col in s1, s2 */
172 
173   /* Malloc for s1_list. It can't be longer than s1 itself; we just malloc
174     * for that (and waste a wee bit of space)
175     */
176     s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
177     r1 = r2 = 0;
178     for (col = 0; s1[col] != '\0'; col++)
179     {
180         /* symbol in s1? Record what it's aligned to, and bump
181         * the r1 counter.
182         */
183         if (! isgap(s1[col]))
184         {
185             s1_list[r1] = isgap(s2[col]) ? -1 : r2;
186             r1++;
187         }
188 
189         /* symbol in s2? bump the r2 counter
190         */
191         if (! isgap(s2[col]))
192             r2++;
193     }
194 
195     *ret_listlen = r1;
196     *ret_s1_list = s1_list;
197     return 1;
198 }
199 
200 
201 
202 /* Function: make_ref_alilist()
203 *
204 * Purpose:  Construct a list (array) mapping the raw symbols of s1
205 *           which are under canonical columns of the ref alignment
206 *           onto the indexes of the aligned symbols in s2 (or -1
207 *           for gaps in s2 or noncanonical symbols in s2).
208 *
209 * Args:     ref:        - array of indices of canonical coords (1 canonical, 0 non)
210 *           k1          - s1's known alignment (w/ respect to refcoords)
211 *           k2          - s2's known alignment (w/ respect to refcoords)
212 *           s1          - sequence to construct the list for
213 *           s2          - sequence s1 is aligned to
214 *           ret_s1_list - RETURN: the constructed list (caller must free)
215 *           ret_listlen - RETURN: length of the list
216 *
217 * Returns:  1 on success, 0 on failure
218 */
219 /*ARGSUSED*/
220 static int
make_ref_alilist(int * ref,char * k1,char * k2,char * s1,char * s2,int ** ret_s1_list,int * ret_listlen)221 make_ref_alilist(int *ref, char *k1, char *k2,
222                  char *s1, char *s2, int **ret_s1_list, int *ret_listlen)
223 {
224     int *s1_list;
225     int  col;         /* column position in alignment */
226     int  r1, r2;          /* raw symbol index at current col in s1, s2 */
227     int *canons1;         /* flag array, 1 if position i in s1 raw seq is canonical */
228     int  lpos;            /* position in list */
229 
230     /* Allocations. No arrays can exceed the length of their
231     * appropriate parent (s1 or s2)
232     */
233     s1_list = (int *) MallocOrDie (sizeof(int) * strlen(s1));
234     canons1 = (int *) MallocOrDie (sizeof(int) * strlen(s1));
235 
236     /* First we use refcoords and k1,k2 to construct an array of 1's
237     * and 0's, telling us whether s1's raw symbol number i is countable.
238     * It's countable simply if it's under a canonical column.
239     */
240     r1 =  0;
241     for (col = 0; k1[col] != '\0'; col++)
242     {
243         if (! isgap(k1[col]))
244         {
245             canons1[r1] = ref[col] ? 1 : 0;
246             r1++;
247         }
248     }
249 
250     /* Now we can construct the list. We don't count pairs if the sym in s1
251     * is non-canonical.
252     * We have to keep separate track of our position in the list (lpos)
253     * from our positions in the raw sequences (r1,r2)
254     */
255     r1 = r2 = lpos = 0;
256     for (col = 0; s1[col] != '\0'; col++)
257     {
258         if (! isgap(s1[col]) && canons1[r1])
259         {
260             s1_list[lpos] = isgap(s2[col]) ? -1 : r2;
261             lpos++;
262         }
263 
264         if (! isgap(s1[col]))
265             r1++;
266         if (! isgap(s2[col]))
267             r2++;
268     }
269 
270     free(canons1);
271     *ret_listlen = lpos;
272     *ret_s1_list = s1_list;
273     return 1;
274 }
275 
276 /* Function: compare_lists()
277 *
278 * Purpose:  Given four alignment lists (k1,k2, t1,t2), calculate the
279 *           alignment score.
280 *
281 * Args:     k1   - list of k1's alignment to k2
282 *           k2   - list of k2's alignment to k1
283 *           t1   - list of t1's alignment to t2
284 *           t2   - list of t2's alignment to t2
285 *           len1 - length of k1, t1 lists (same by definition)
286 *           len2 - length of k2, t2 lists (same by definition)
287 *           ret_sc - RETURN: identity score of alignment
288 *
289 * Return:   1 on success, 0 on failure.
290 */
291 static int
compare_lists(int * k1,int * k2,int * t1,int * t2,int len1,int len2,float * ret_sc)292 compare_lists(int *k1, int *k2, int *t1, int *t2, int len1, int len2, float *ret_sc)
293 {
294     float id;
295     float tot;
296     int   i;
297 
298     id = tot = 0.0;
299     for (i = 0; i < len1; i++)
300     {
301         tot += 1.0;
302         if (t1[i] == k1[i]) id += 1.0;
303     }
304 
305     for ( i = 0; i < len2; i++)
306     {
307         tot += 1.0;
308         if (k2[i] == t2[i]) id += 1.0;
309     }
310 
311     *ret_sc = id / tot;
312     return 1;
313 }
314 
315 
316 /* Function: CompareMultAlignments
317 *
318 * Purpose:  Invokes pairwise alignment comparison for every possible pair,
319 *           and returns the average score over all N(N-1) of them or -1.0
320 *           on an internal failure.
321 *
322 *           Can be slow for large N, since it's quadratic.
323 *
324 * Args:     kseqs  - trusted multiple alignment
325 *           tseqs  - test multiple alignment
326 *           N      - number of sequences
327 *
328 * Return:   average identity score, or -1.0 on failure.
329 */
330 float
CompareMultAlignments(char ** kseqs,char ** tseqs,int N)331 CompareMultAlignments(char **kseqs, char **tseqs, int N)
332 {
333     int    i, j;          /* counters for sequences */
334     float  score;
335     float  tot_score = 0.0;
336     /* do all pairwise comparisons */
337     for (i = 0; i < N; i++)
338         for (j = i+1; j < N; j++)
339         {
340             score = ComparePairAlignments(kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
341             if (score < 0.0) return -1.0;
342             tot_score += score;
343         }
344         return ((tot_score * 2.0) / ((float) N * ((float) N - 1.0)));
345 }
346 
347 
348 
349 /* Function: CompareRefMultAlignments()
350 *
351 * Purpose:  Same as above, except an array of reference coords for
352 *           the canonical positions of the known alignment is also
353 *           provided.
354 *
355 * Args:     ref      : 0..alen-1 array of 1/0 flags, 1 if canon
356 *           kseqs    : trusted alignment
357 *           tseqs    : test alignment
358 *           N        : number of sequences
359 *
360 * Return:   average identity score, or -1.0 on failure
361 */
362 float
CompareRefMultAlignments(int * ref,char ** kseqs,char ** tseqs,int N)363 CompareRefMultAlignments(int   *ref, char **kseqs, char **tseqs, int N)
364 {
365     int    i, j;          /* counters for sequences */
366     float  score;
367     float  tot_score = 0.0;
368 
369     /* do all pairwise comparisons */
370     for (i = 0; i < N; i++)
371         for (j = i+1; j < N; j++)
372         {
373             score = CompareRefPairAlignments(ref, kseqs[i], kseqs[j], tseqs[i], tseqs[j]);
374             if (score < 0.0) return -1.0;
375             tot_score += score;
376         }
377         return ((tot_score * 2.0)/ ((float) N * ((float) N - 1.0)));
378 }
379 
380 /* Function: PairwiseIdentity()
381 *
382 * Purpose:  Calculate the pairwise fractional identity between
383 *           two aligned sequences s1 and s2. This is simply
384 *           (idents / MIN(len1, len2)).
385 *
386 *           Note how many ways there are to calculate pairwise identity,
387 *           because of the variety of choices for the denominator:
388 *           idents/(idents+mismat) has the disadvantage that artifactual
389 *             gappy alignments would have high "identities".
390 *           idents/(AVG|MAX)(len1,len2) both have the disadvantage that
391 *             alignments of fragments to longer sequences would have
392 *             artifactually low "identities".
393 *
394 *           Case sensitive; also, watch out in nucleic acid alignments;
395 *           U/T RNA/DNA alignments will be counted as mismatches!
396 */
397 float
PairwiseIdentity(char * s1,char * s2)398 PairwiseIdentity(char *s1, char *s2)
399 {
400     int     idents;       /* total identical positions  */
401     int     len1, len2;       /* lengths of seqs            */
402     int     x;            /* position in aligned seqs   */
403 
404     idents = len1 = len2 = 0;
405     for (x = 0; s1[x] != '\0' && s2[x] != '\0'; x++)
406     {
407         if (!isgap(s1[x])) {
408             len1++;
409             if (s1[x] == s2[x]) idents++;
410         }
411         if (!isgap(s2[x])) len2++;
412     }
413     if (len2 < len1) len1 = len2;
414     return (len1 == 0 ? 0.0 : (float) idents / (float) len1);
415 }
416 
417 
418 
419 /* Function: AlignmentIdentityBySampling()
420 * Date:     SRE, Mon Oct 19 14:29:01 1998 [St. Louis]
421 *
422 * Purpose:  Estimate and return the average pairwise
423 *           fractional identity of an alignment,
424 *           using sampling.
425 *
426 *           For use when there's so many sequences that
427 *           an all vs. all rigorous calculation will
428 *           take too long.
429 *
430 *           Case sensitive!
431 *
432 * Args:     aseq       - aligned sequences
433 *           L          - length of alignment
434 *           N          - number of seqs in alignment
435 *           nsample    - number of samples
436 *
437 * Returns:  average fractional identity, 0..1.
438 */
439 float
AlignmentIdentityBySampling(char ** aseq,int L,int N,int nsample)440 AlignmentIdentityBySampling(char **aseq, int L, int N, int nsample)
441 {
442     int x, i, j;          /* counters */
443     float sum;
444 
445     if (N < 2) return 1.0;
446 
447     sum = 0.;
448     for (x = 0; x < nsample; x++)
449     {
450         i = CHOOSE(N);
451         do { j = CHOOSE(N); } while (j == i); /* make sure j != i */
452         sum += PairwiseIdentity(aseq[i], aseq[j]);
453     }
454     return sum / (float) nsample;
455 }
456 
457 /* Function: MajorityRuleConsensus()
458 * Date:     SRE, Tue Mar  7 15:30:30 2000 [St. Louis]
459 *
460 * Purpose:  Given a set of aligned sequences, produce a
461 *           majority rule consensus sequence. If >50% nonalphabetic
462 *           (usually meaning gaps) in the column, ignore the column.
463 *
464 * Args:     aseq  - aligned sequences, [0..nseq-1][0..alen-1]
465 *           nseq  - number of sequences
466 *           alen  - length of alignment
467 *
468 * Returns:  ptr to allocated consensus sequence.
469 *           Caller is responsible for free'ing this.
470 */
471 char *
MajorityRuleConsensus(char ** aseq,int nseq,int alen)472 MajorityRuleConsensus(char **aseq, int nseq, int alen)
473 {
474     char *cs;                     /* RETURN: consensus sequence */
475     int count[27];        /* counts for a..z and gaps in a column */
476     int idx,apos;         /* counters for seq, column */
477     int spos;         /* position in cs */
478     int x;            /* counter for characters */
479     int sym;
480     int max, bestx;
481 
482     cs = (char*)MallocOrDie(sizeof(char) * (alen+1));
483 
484     for (spos=0,apos=0; apos < alen; apos++)
485     {
486         for (x = 0; x < 27; x++) count[x] = 0;
487 
488         for (idx = 0; idx < nseq; idx++)
489         {
490             if (isalpha((int) aseq[idx][apos])) {
491                 sym = toupper((int) aseq[idx][apos]);
492                 count[sym-'A']++;
493             } else {
494                 count[26]++;
495             }
496         }
497 
498         if ((float) count[26] / (float) nseq <= 0.5) {
499             max = bestx = -1;
500             for (x = 0; x < 26; x++)
501                 if (count[x] > max) { max = count[x]; bestx = x; }
502                 cs[spos++] = (char) ('A' + bestx);
503         }
504     }
505     cs[spos] = '\0';
506     return cs;
507 }
508 
509 /* Function: DealignedLength()
510 *
511 * Purpose:  Count the number of non-gap symbols in seq.
512 *           (i.e. find the length of the unaligned sequence)
513 *
514 * Args:     aseq - aligned sequence to count symbols in, \0 terminated
515 *
516 * Return:   raw length of seq.
517 */
518 int
DealignedLength(char * aseq)519 DealignedLength(char *aseq)
520 {
521     int rlen;
522     for (rlen = 0; *aseq; aseq++)
523         if (! isgap(*aseq)) rlen++;
524     return rlen;
525 }
526 
527 
528 
529 /* Function: MakeAlignedString()
530 *
531 * Purpose:  Given a raw string of some type (secondary structure, say),
532 *           align it to a given aseq by putting gaps wherever the
533 *           aseq has gaps.
534 *
535 * Args:     aseq:  template for alignment
536 *           alen:  length of aseq
537 *           ss:    raw string to align to aseq
538 *           ret_s: RETURN: aligned ss
539 *
540 * Return:   1 on success, 0 on failure (and squid_errno is set.)
541 *           ret_ss is MallocOrDie'ed here and must be free'd by caller.
542 */
543 int
MakeAlignedString(char * aseq,int alen,char * ss,char ** ret_s)544 MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
545 {
546     char *newS;
547     unsigned   apos, rpos;
548 
549     newS = (char *) MallocOrDie ((alen+1) * sizeof(char));
550     for (apos = rpos = 0; apos < alen; apos++)
551         if (! isgap(aseq[apos]))
552         {
553             newS[apos] = ss[rpos];
554             rpos++;
555         }
556         else
557             newS[apos] = '.';
558     newS[apos] = '\0';
559 
560     if (rpos != strlen(ss))
561     { free(newS); return 0; }
562     *ret_s = newS;
563     return 1;
564 }
565