1 /* SQUID - A C function library for biological sequence analysis
2  * Copyright (C) 1992-1996 Sean R. Eddy
3  *
4  *    This source code is distributed under terms of the
5  *    GNU General Public License. See the files COPYING
6  *    and GNULICENSE for further details.
7  *
8  */
9 
10 /* alignio.c
11  * SRE, Mon Jul 12 11:57:37 1993
12  *
13  * Input/output of sequence alignments.
14  */
15 
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include "squid.h"
20 
21 #ifdef MEMDEBUG
22 #include "dbmalloc.h"
23 #endif
24 
25 /* Function: FreeAlignment()
26  *
27  * Purpose:  Free the space allocated to alignment, names, and optional
28  *           information.
29  *
30  * Args:     aseqs - sequence alignment
31  *           nseq  - number of sequences
32  *           ainfo - optional extra data. May be NULL.
33  */
34 void
FreeAlignment(char ** aseqs,int nseq,struct aliinfo_s * ainfo)35 FreeAlignment(char **aseqs, int nseq, struct aliinfo_s *ainfo)
36 {
37   int i;
38 
39   for (i = 0; i < nseq; i++)
40     {
41       if (ainfo->sqinfo[i].flags & SQINFO_SS)   free(ainfo->sqinfo[i].ss);
42       if (ainfo->sqinfo[i].flags & SQINFO_SA)   free(ainfo->sqinfo[i].sa);
43     }
44   if (ainfo->flags & AINFO_CS) free(ainfo->cs);
45   if (ainfo->flags & AINFO_RF) free(ainfo->rf);
46   free(ainfo->sqinfo);
47   Free2DArray(aseqs, nseq);
48 }
49 
50 /* Function: MakeAlignedString()
51  *
52  * Purpose:  Given a raw string of some type (secondary structure, say),
53  *           align it to a given aseq by putting gaps wherever the
54  *           aseq has gaps.
55  *
56  * Args:     aseq:  template for alignment
57  *           alen:  length of aseq
58  *           ss:    raw string to align to aseq
59  *           ret_s: RETURN: aligned ss
60  *
61  * Return:   1 on success, 0 on failure (and squid_errno is set.)
62  *           ret_ss is malloc'ed here and must be free'd by caller.
63  */
64 int
MakeAlignedString(char * aseq,int alen,char * ss,char ** ret_s)65 MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
66 {
67   char *new;
68   int   apos, rpos;
69 
70   if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL)
71     { squid_errno = SQERR_MEM; return 0; }
72   for (apos = rpos = 0; apos < alen; apos++)
73     if (! isgap(aseq[apos]))
74       {
75 	new[apos] = ss[rpos];
76 	rpos++;
77       }
78     else
79       new[apos] = '.';
80   new[apos] = '\0';
81 
82   if (rpos != strlen(ss))
83     { squid_errno = SQERR_PARAMETER; free(new); return 0; }
84   *ret_s = new;
85   return 1;
86 }
87 
88 
89 /* Function: MakeDealignedString()
90  *
91  * Purpose:  Given an aligned string of some type (either sequence or
92  *           secondary structure, for instance), dealign it relative
93  *           to a given aseq. Return a ptr to the new string.
94  *
95  * Args:     aseq  : template alignment
96  *           alen  : length of aseq
97  *           ss:   : string to make dealigned copy of; same length as aseq
98  *           ret_s : RETURN: dealigned copy of ss
99  *
100  * Return:   1 on success, 0 on failure (and squid_errno is set)
101  *           ret_s is alloc'ed here and must be freed by caller
102  */
103 int
MakeDealignedString(char * aseq,int alen,char * ss,char ** ret_s)104 MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
105 {
106   char *new;
107   int   apos, rpos;
108 
109   if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL)
110     { squid_errno = SQERR_MEM; return 0; }
111   for (apos = rpos = 0; apos < alen; apos++)
112     if (! isgap(aseq[apos]))
113       {
114 	new[rpos] = ss[apos];
115 	rpos++;
116       }
117   new[rpos] = '\0';
118   if (alen != strlen(ss))
119     { squid_errno = SQERR_PARAMETER; free(new); return 0; }
120   *ret_s = new;
121   return 1;
122 }
123 
124 
125 
126 /* Function: WritePairwiseAlignment()
127  *
128  * Purpose:  Write a nice formatted pairwise alignment out,
129  *           with a BLAST-style middle line showing identities
130  *           as themselves (single letter) and conservative
131  *           changes as '+'.
132  *
133  * Args:     ofp          - open fp to write to (stdout, perhaps)
134  *           aseq1, aseq2 - alignments to write (not necessarily
135  *                          flushed right with gaps)
136  *           name1, name2 - names of sequences
137  *           spos1, spos2 - starting position in each (raw) sequence
138  *           pam          - PAM matrix; positive values define
139  *                          conservative changes
140  *           indent       - how many extra spaces to print on left
141  *
142  * Return:  1 on success, 0 on failure
143  */
144 int
WritePairwiseAlignment(FILE * ofp,char * aseq1,char * name1,int spos1,char * aseq2,char * name2,int spos2,int ** pam,int indent)145 WritePairwiseAlignment(FILE *ofp,
146 		       char *aseq1, char *name1, int spos1,
147 		       char *aseq2, char *name2, int spos2,
148 		       int **pam, int indent)
149 {
150   char sname1[11];              /* shortened name               */
151   char sname2[11];
152   int  still_going;		/* True if writing another block */
153   char buf1[61];		/* buffer for writing seq1; CPL+1*/
154   char bufmid[61];              /* buffer for writing consensus  */
155   char buf2[61];
156   char *s1, *s2;                /* ptrs into each sequence          */
157   int  count1, count2;		/* number of symbols we're writing  */
158   int  rpos1, rpos2;		/* position in raw seqs             */
159   int  rawcount1, rawcount2;	/* number of nongap symbols written */
160   int  apos;
161 
162   strncpy(sname1, name1, 10);
163   sname1[10] = '\0';
164   strtok(sname1, WHITESPACE);
165 
166   strncpy(sname2, name2, 10);
167   sname2[10] = '\0';
168   strtok(sname2, WHITESPACE);
169 
170   s1 = aseq1;
171   s2 = aseq2;
172   rpos1 = spos1;
173   rpos2 = spos2;
174 
175   still_going = True;
176   while (still_going)
177     {
178       still_going = False;
179 
180 				/* get next line's worth from both */
181       strncpy(buf1, s1, 60); buf1[60] = '\0';
182       strncpy(buf2, s2, 60); buf2[60] = '\0';
183       count1 = strlen(buf1);
184       count2 = strlen(buf2);
185 
186 				/* is there still more to go? */
187       if ((count1 == 60 && s1[60] != '\0') ||
188 	  (count2 == 60 && s2[60] != '\0'))
189 	still_going = True;
190 
191 				/* shift seq ptrs by a line */
192       s1 += count1;
193       s2 += count2;
194 
195 				/* assemble the consensus line */
196       for (apos = 0; apos < count1 && apos < count2; apos++)
197 	{
198 	  if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
199 	    {
200 	      if (buf1[apos] == buf2[apos])
201 		bufmid[apos] = buf1[apos];
202 	      else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
203 		bufmid[apos] = '+';
204 	      else
205 		bufmid[apos] = ' ';
206 	    }
207 	  else
208 	    bufmid[apos] = ' ';
209 	}
210       bufmid[apos] = '\0';
211 
212       rawcount1 = 0;
213       for (apos = 0; apos < count1; apos++)
214 	if (!isgap(buf1[apos])) rawcount1++;
215 
216       rawcount2 = 0;
217       for (apos = 0; apos < count2; apos++)
218 	if (!isgap(buf2[apos])) rawcount2++;
219 
220       (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
221 		     sname1, rpos1, buf1, rpos1 + rawcount1 -1);
222       (void) fprintf(ofp, "%*s                 %s\n", indent, "",
223 		     bufmid);
224       (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
225 		     sname2, rpos2, buf2, rpos2 + rawcount2 -1);
226       (void) fprintf(ofp, "\n");
227 
228       rpos1 += rawcount1;
229       rpos2 += rawcount2;
230     }
231 
232   return 1;
233 }
234 
235 
236 /* Function: MingapAlignment()
237  *
238  * Purpose:  Remove all-gap columns from a multiple sequence alignment
239  *           and its associated data. The alignment is assumed to be
240  *           flushed (all aseqs the same length).
241  */
242 int
MingapAlignment(char ** aseqs,int num,struct aliinfo_s * ainfo)243 MingapAlignment(char **aseqs, int num, struct aliinfo_s *ainfo)
244 {
245   int apos;			/* position in original alignment */
246   int mpos;			/* position in new alignment      */
247   int idx;
248 
249   /* We overwrite aseqs, using its allocated memory.
250    */
251   for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
252     {
253 				/* check for all-gap in column */
254       for (idx = 0; idx < num; idx++)
255 	if (! isgap(aseqs[idx][apos]))
256 	  break;
257       if (idx == num) continue;
258 
259 				/* shift alignment and ainfo */
260       if (mpos != apos)
261 	{
262 	  for (idx = 0; idx < num; idx++)
263 	    aseqs[idx][mpos] = aseqs[idx][apos];
264 
265 	  if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = ainfo->cs[apos];
266 	  if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = ainfo->rf[apos];
267 	}
268       mpos++;
269     }
270 				/* null terminate everything */
271   for (idx = 0; idx < num; idx++)
272     aseqs[idx][mpos] = '\0';
273   ainfo->alen = mpos;	/* set new length */
274   if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = '\0';
275   if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = '\0';
276   return 1;
277 }
278 
279 
280 
281 /* Function: RandomAlignment()
282  *
283  * Purpose:  Create a random alignment from raw sequences.
284  *
285  *           Ideally, we would like to sample an alignment from the
286  *           space of possible alignments according to its probability,
287  *           given a prior probability distribution for alignments.
288  *           I don't see how to describe such a distribution, let alone
289  *           sample it.
290  *
291  *           This is a rough approximation that tries to capture some
292  *           desired properties. We assume the alignment is generated
293  *           by a simple HMM composed of match and insert states.
294  *           Given parameters (pop, pex) for the probability of opening
295  *           and extending an insertion, we can find the expected number
296  *           of match states, M, in the underlying model for each sequence.
297  *           We use an average M taken over all the sequences (this is
298  *           an approximation. The expectation of M given all the sequence
299  *           lengths is a nasty-looking summation.)
300  *
301  *           M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
302  *
303  *           Then, we assign positions in each raw sequence onto the M match
304  *           states and M+1 insert states of this "HMM", by rolling random
305  *           numbers and inserting the (rlen-M) inserted positions randomly
306  *           into the insert slots, taking into account the relative probability
307  *           of open vs. extend.
308  *
309  *           The resulting alignment has two desired properties: insertions
310  *           tend to follow the HMM-like exponential distribution, and
311  *           the "sparseness" of the alignment is controllable through
312  *           pop and pex.
313  *
314  * Args:     rseqs     - raw sequences to "align", 0..nseq-1
315  *           sqinfo    - array of 0..nseq-1 info structures for the sequences
316  *           nseq      - number of sequences
317  *           pop       - probability to open insertion (0<pop<1)
318  *           pex       - probability to extend insertion (0<pex<1)
319  *           ret_aseqs - RETURN: alignment (flushed)
320  *           ainfo     - fill in: alignment info
321  *
322  * Return:   1 on success, 0 on failure. Sets squid_errno to indicate cause
323  *           of failure.
324  */
325 int
RandomAlignment(char ** rseqs,SQINFO * sqinfo,int nseq,float pop,float pex,char *** ret_aseqs,AINFO * ainfo)326 RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
327 		char ***ret_aseqs, AINFO *ainfo)
328 {
329   char **aseqs;                 /* RETURN: alignment   */
330   int    alen;			/* length of alignment */
331   int   *rlen;                  /* lengths of each raw sequence */
332   int    M;			/* length of "model"   */
333   int  **ins;                   /* insertion counts, 0..nseq-1 by 0..M */
334   int   *master_ins;            /* max insertion counts, 0..M */
335   int    apos, rpos, idx;
336   int    statepos;
337   int    count;
338   int    minlen;
339 
340   /* calculate expected length of model, M
341    */
342   if ((rlen = (int *) malloc (sizeof(int) * nseq)) == NULL)
343     { squid_errno = SQERR_MEM; return 0; }
344   M = 0;
345   minlen = 9999999;
346   for (idx = 0; idx < nseq; idx++)
347     {
348       rlen[idx] = strlen(rseqs[idx]);
349       M += rlen[idx];
350       minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
351     }
352   M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
353   M /= nseq;
354   if (M > minlen) M = minlen;
355 
356   /* make arrays that count insertions in M+1 possible insert states
357    */
358   if ((ins = (int **) malloc (sizeof(int *) * nseq)) == NULL ||
359       (master_ins = (int *) malloc (sizeof(int) * (M+1))) == NULL)
360     { squid_errno = SQERR_MEM; return 0; }
361   for (idx = 0; idx < nseq; idx++)
362     {
363       if ((ins[idx] = (int *) malloc (sizeof(int) * (M+1))) == NULL)
364 	{ squid_errno = SQERR_MEM; return 0; }
365       for (rpos = 0; rpos <= M; rpos++)
366 	ins[idx][rpos] = 0;
367     }
368 				/* normalize */
369   pop = pop / (pop+pex);
370   pex = 1.0 - pop;
371 				/* make insertions for individual sequences */
372   for (idx = 0; idx < nseq; idx++)
373     {
374       apos = -1;
375       for (rpos = 0; rpos < rlen[idx]-M; rpos++)
376 	{
377 	  if (sre_random() < pop || apos == -1)	/* open insertion */
378 	    apos = CHOOSE(M+1);        /* choose 0..M */
379 	  ins[idx][apos]++;
380 	}
381     }
382 				/* calculate master_ins, max inserts */
383   alen = M;
384   for (apos = 0; apos <= M; apos++)
385     {
386       master_ins[apos] = 0;
387       for (idx = 0; idx < nseq; idx++)
388 	if (ins[idx][apos] > master_ins[apos])
389 	  master_ins[apos] = ins[idx][apos];
390       alen += master_ins[apos];
391     }
392 
393 
394   /* Now, construct alignment
395    */
396   if ((aseqs = (char **) malloc (sizeof (char *) * nseq)) == NULL)
397     { squid_errno = SQERR_MEM; return 0; }
398   for (idx = 0; idx < nseq; idx++)
399     if ((aseqs[idx] = (char *) malloc (sizeof(char) * (alen+1))) == NULL)
400       { squid_errno = SQERR_MEM; return 0; }
401 
402   for (idx = 0; idx < nseq; idx++)
403     {
404       apos = rpos = 0;
405 
406       for (statepos = 0; statepos <= M; statepos++)
407 	{
408 	  for (count = 0; count < ins[idx][statepos]; count++)
409 	    aseqs[idx][apos++] = rseqs[idx][rpos++];
410 	  for (; count < master_ins[statepos]; count++)
411 	    aseqs[idx][apos++] = ' ';
412 
413 	  if (statepos != M)
414 	    aseqs[idx][apos++] = rseqs[idx][rpos++];
415 	}
416       aseqs[idx][alen] = '\0';
417     }
418   ainfo->flags = 0;
419   ainfo->alen  = alen; ainfo->flags |= AINFO_ALEN;
420 
421   if ((ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO) * nseq)) == NULL)
422     Die("malloc failed");
423   for (idx = 0; idx < nseq; idx++)
424     SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
425 
426   free(rlen);
427   free(master_ins);
428   Free2DArray(ins, nseq);
429   *ret_aseqs = aseqs;
430   return 1;
431 }
432