1 /* SQUID - A C function library for biological sequence analysis
2 * Copyright (C) 1992-1996 Sean R. Eddy
3 *
4 * This source code is distributed under terms of the
5 * GNU General Public License. See the files COPYING
6 * and GNULICENSE for further details.
7 *
8 */
9
10 /* alignio.c
11 * SRE, Mon Jul 12 11:57:37 1993
12 *
13 * Input/output of sequence alignments.
14 */
15
16 #include <stdio.h>
17 #include <stdlib.h>
18 #include <string.h>
19 #include "squid.h"
20
21 #ifdef MEMDEBUG
22 #include "dbmalloc.h"
23 #endif
24
25 /* Function: FreeAlignment()
26 *
27 * Purpose: Free the space allocated to alignment, names, and optional
28 * information.
29 *
30 * Args: aseqs - sequence alignment
31 * nseq - number of sequences
32 * ainfo - optional extra data. May be NULL.
33 */
34 void
FreeAlignment(char ** aseqs,int nseq,struct aliinfo_s * ainfo)35 FreeAlignment(char **aseqs, int nseq, struct aliinfo_s *ainfo)
36 {
37 int i;
38
39 for (i = 0; i < nseq; i++)
40 {
41 if (ainfo->sqinfo[i].flags & SQINFO_SS) free(ainfo->sqinfo[i].ss);
42 if (ainfo->sqinfo[i].flags & SQINFO_SA) free(ainfo->sqinfo[i].sa);
43 }
44 if (ainfo->flags & AINFO_CS) free(ainfo->cs);
45 if (ainfo->flags & AINFO_RF) free(ainfo->rf);
46 free(ainfo->sqinfo);
47 Free2DArray(aseqs, nseq);
48 }
49
50 /* Function: MakeAlignedString()
51 *
52 * Purpose: Given a raw string of some type (secondary structure, say),
53 * align it to a given aseq by putting gaps wherever the
54 * aseq has gaps.
55 *
56 * Args: aseq: template for alignment
57 * alen: length of aseq
58 * ss: raw string to align to aseq
59 * ret_s: RETURN: aligned ss
60 *
61 * Return: 1 on success, 0 on failure (and squid_errno is set.)
62 * ret_ss is malloc'ed here and must be free'd by caller.
63 */
64 int
MakeAlignedString(char * aseq,int alen,char * ss,char ** ret_s)65 MakeAlignedString(char *aseq, int alen, char *ss, char **ret_s)
66 {
67 char *new;
68 int apos, rpos;
69
70 if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL)
71 { squid_errno = SQERR_MEM; return 0; }
72 for (apos = rpos = 0; apos < alen; apos++)
73 if (! isgap(aseq[apos]))
74 {
75 new[apos] = ss[rpos];
76 rpos++;
77 }
78 else
79 new[apos] = '.';
80 new[apos] = '\0';
81
82 if (rpos != strlen(ss))
83 { squid_errno = SQERR_PARAMETER; free(new); return 0; }
84 *ret_s = new;
85 return 1;
86 }
87
88
89 /* Function: MakeDealignedString()
90 *
91 * Purpose: Given an aligned string of some type (either sequence or
92 * secondary structure, for instance), dealign it relative
93 * to a given aseq. Return a ptr to the new string.
94 *
95 * Args: aseq : template alignment
96 * alen : length of aseq
97 * ss: : string to make dealigned copy of; same length as aseq
98 * ret_s : RETURN: dealigned copy of ss
99 *
100 * Return: 1 on success, 0 on failure (and squid_errno is set)
101 * ret_s is alloc'ed here and must be freed by caller
102 */
103 int
MakeDealignedString(char * aseq,int alen,char * ss,char ** ret_s)104 MakeDealignedString(char *aseq, int alen, char *ss, char **ret_s)
105 {
106 char *new;
107 int apos, rpos;
108
109 if ((new = (char *) malloc ((alen+1) * sizeof(char))) == NULL)
110 { squid_errno = SQERR_MEM; return 0; }
111 for (apos = rpos = 0; apos < alen; apos++)
112 if (! isgap(aseq[apos]))
113 {
114 new[rpos] = ss[apos];
115 rpos++;
116 }
117 new[rpos] = '\0';
118 if (alen != strlen(ss))
119 { squid_errno = SQERR_PARAMETER; free(new); return 0; }
120 *ret_s = new;
121 return 1;
122 }
123
124
125
126 /* Function: WritePairwiseAlignment()
127 *
128 * Purpose: Write a nice formatted pairwise alignment out,
129 * with a BLAST-style middle line showing identities
130 * as themselves (single letter) and conservative
131 * changes as '+'.
132 *
133 * Args: ofp - open fp to write to (stdout, perhaps)
134 * aseq1, aseq2 - alignments to write (not necessarily
135 * flushed right with gaps)
136 * name1, name2 - names of sequences
137 * spos1, spos2 - starting position in each (raw) sequence
138 * pam - PAM matrix; positive values define
139 * conservative changes
140 * indent - how many extra spaces to print on left
141 *
142 * Return: 1 on success, 0 on failure
143 */
144 int
WritePairwiseAlignment(FILE * ofp,char * aseq1,char * name1,int spos1,char * aseq2,char * name2,int spos2,int ** pam,int indent)145 WritePairwiseAlignment(FILE *ofp,
146 char *aseq1, char *name1, int spos1,
147 char *aseq2, char *name2, int spos2,
148 int **pam, int indent)
149 {
150 char sname1[11]; /* shortened name */
151 char sname2[11];
152 int still_going; /* True if writing another block */
153 char buf1[61]; /* buffer for writing seq1; CPL+1*/
154 char bufmid[61]; /* buffer for writing consensus */
155 char buf2[61];
156 char *s1, *s2; /* ptrs into each sequence */
157 int count1, count2; /* number of symbols we're writing */
158 int rpos1, rpos2; /* position in raw seqs */
159 int rawcount1, rawcount2; /* number of nongap symbols written */
160 int apos;
161
162 strncpy(sname1, name1, 10);
163 sname1[10] = '\0';
164 strtok(sname1, WHITESPACE);
165
166 strncpy(sname2, name2, 10);
167 sname2[10] = '\0';
168 strtok(sname2, WHITESPACE);
169
170 s1 = aseq1;
171 s2 = aseq2;
172 rpos1 = spos1;
173 rpos2 = spos2;
174
175 still_going = True;
176 while (still_going)
177 {
178 still_going = False;
179
180 /* get next line's worth from both */
181 strncpy(buf1, s1, 60); buf1[60] = '\0';
182 strncpy(buf2, s2, 60); buf2[60] = '\0';
183 count1 = strlen(buf1);
184 count2 = strlen(buf2);
185
186 /* is there still more to go? */
187 if ((count1 == 60 && s1[60] != '\0') ||
188 (count2 == 60 && s2[60] != '\0'))
189 still_going = True;
190
191 /* shift seq ptrs by a line */
192 s1 += count1;
193 s2 += count2;
194
195 /* assemble the consensus line */
196 for (apos = 0; apos < count1 && apos < count2; apos++)
197 {
198 if (!isgap(buf1[apos]) && !isgap(buf2[apos]))
199 {
200 if (buf1[apos] == buf2[apos])
201 bufmid[apos] = buf1[apos];
202 else if (pam[buf1[apos] - 'A'][buf2[apos] - 'A'] > 0)
203 bufmid[apos] = '+';
204 else
205 bufmid[apos] = ' ';
206 }
207 else
208 bufmid[apos] = ' ';
209 }
210 bufmid[apos] = '\0';
211
212 rawcount1 = 0;
213 for (apos = 0; apos < count1; apos++)
214 if (!isgap(buf1[apos])) rawcount1++;
215
216 rawcount2 = 0;
217 for (apos = 0; apos < count2; apos++)
218 if (!isgap(buf2[apos])) rawcount2++;
219
220 (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
221 sname1, rpos1, buf1, rpos1 + rawcount1 -1);
222 (void) fprintf(ofp, "%*s %s\n", indent, "",
223 bufmid);
224 (void) fprintf(ofp, "%*s%-10.10s %5d %s %5d\n", indent, "",
225 sname2, rpos2, buf2, rpos2 + rawcount2 -1);
226 (void) fprintf(ofp, "\n");
227
228 rpos1 += rawcount1;
229 rpos2 += rawcount2;
230 }
231
232 return 1;
233 }
234
235
236 /* Function: MingapAlignment()
237 *
238 * Purpose: Remove all-gap columns from a multiple sequence alignment
239 * and its associated data. The alignment is assumed to be
240 * flushed (all aseqs the same length).
241 */
242 int
MingapAlignment(char ** aseqs,int num,struct aliinfo_s * ainfo)243 MingapAlignment(char **aseqs, int num, struct aliinfo_s *ainfo)
244 {
245 int apos; /* position in original alignment */
246 int mpos; /* position in new alignment */
247 int idx;
248
249 /* We overwrite aseqs, using its allocated memory.
250 */
251 for (apos = 0, mpos = 0; aseqs[0][apos] != '\0'; apos++)
252 {
253 /* check for all-gap in column */
254 for (idx = 0; idx < num; idx++)
255 if (! isgap(aseqs[idx][apos]))
256 break;
257 if (idx == num) continue;
258
259 /* shift alignment and ainfo */
260 if (mpos != apos)
261 {
262 for (idx = 0; idx < num; idx++)
263 aseqs[idx][mpos] = aseqs[idx][apos];
264
265 if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = ainfo->cs[apos];
266 if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = ainfo->rf[apos];
267 }
268 mpos++;
269 }
270 /* null terminate everything */
271 for (idx = 0; idx < num; idx++)
272 aseqs[idx][mpos] = '\0';
273 ainfo->alen = mpos; /* set new length */
274 if (ainfo->flags & AINFO_CS) ainfo->cs[mpos] = '\0';
275 if (ainfo->flags & AINFO_RF) ainfo->rf[mpos] = '\0';
276 return 1;
277 }
278
279
280
281 /* Function: RandomAlignment()
282 *
283 * Purpose: Create a random alignment from raw sequences.
284 *
285 * Ideally, we would like to sample an alignment from the
286 * space of possible alignments according to its probability,
287 * given a prior probability distribution for alignments.
288 * I don't see how to describe such a distribution, let alone
289 * sample it.
290 *
291 * This is a rough approximation that tries to capture some
292 * desired properties. We assume the alignment is generated
293 * by a simple HMM composed of match and insert states.
294 * Given parameters (pop, pex) for the probability of opening
295 * and extending an insertion, we can find the expected number
296 * of match states, M, in the underlying model for each sequence.
297 * We use an average M taken over all the sequences (this is
298 * an approximation. The expectation of M given all the sequence
299 * lengths is a nasty-looking summation.)
300 *
301 * M = len / ( 1 + pop ( 1 + 1/ (1-pex) ) )
302 *
303 * Then, we assign positions in each raw sequence onto the M match
304 * states and M+1 insert states of this "HMM", by rolling random
305 * numbers and inserting the (rlen-M) inserted positions randomly
306 * into the insert slots, taking into account the relative probability
307 * of open vs. extend.
308 *
309 * The resulting alignment has two desired properties: insertions
310 * tend to follow the HMM-like exponential distribution, and
311 * the "sparseness" of the alignment is controllable through
312 * pop and pex.
313 *
314 * Args: rseqs - raw sequences to "align", 0..nseq-1
315 * sqinfo - array of 0..nseq-1 info structures for the sequences
316 * nseq - number of sequences
317 * pop - probability to open insertion (0<pop<1)
318 * pex - probability to extend insertion (0<pex<1)
319 * ret_aseqs - RETURN: alignment (flushed)
320 * ainfo - fill in: alignment info
321 *
322 * Return: 1 on success, 0 on failure. Sets squid_errno to indicate cause
323 * of failure.
324 */
325 int
RandomAlignment(char ** rseqs,SQINFO * sqinfo,int nseq,float pop,float pex,char *** ret_aseqs,AINFO * ainfo)326 RandomAlignment(char **rseqs, SQINFO *sqinfo, int nseq, float pop, float pex,
327 char ***ret_aseqs, AINFO *ainfo)
328 {
329 char **aseqs; /* RETURN: alignment */
330 int alen; /* length of alignment */
331 int *rlen; /* lengths of each raw sequence */
332 int M; /* length of "model" */
333 int **ins; /* insertion counts, 0..nseq-1 by 0..M */
334 int *master_ins; /* max insertion counts, 0..M */
335 int apos, rpos, idx;
336 int statepos;
337 int count;
338 int minlen;
339
340 /* calculate expected length of model, M
341 */
342 if ((rlen = (int *) malloc (sizeof(int) * nseq)) == NULL)
343 { squid_errno = SQERR_MEM; return 0; }
344 M = 0;
345 minlen = 9999999;
346 for (idx = 0; idx < nseq; idx++)
347 {
348 rlen[idx] = strlen(rseqs[idx]);
349 M += rlen[idx];
350 minlen = (rlen[idx] < minlen) ? rlen[idx] : minlen;
351 }
352 M = (int) ((float) M / (1.0 + pop * (1.0 + 1.0 / (1.0 - pex))));
353 M /= nseq;
354 if (M > minlen) M = minlen;
355
356 /* make arrays that count insertions in M+1 possible insert states
357 */
358 if ((ins = (int **) malloc (sizeof(int *) * nseq)) == NULL ||
359 (master_ins = (int *) malloc (sizeof(int) * (M+1))) == NULL)
360 { squid_errno = SQERR_MEM; return 0; }
361 for (idx = 0; idx < nseq; idx++)
362 {
363 if ((ins[idx] = (int *) malloc (sizeof(int) * (M+1))) == NULL)
364 { squid_errno = SQERR_MEM; return 0; }
365 for (rpos = 0; rpos <= M; rpos++)
366 ins[idx][rpos] = 0;
367 }
368 /* normalize */
369 pop = pop / (pop+pex);
370 pex = 1.0 - pop;
371 /* make insertions for individual sequences */
372 for (idx = 0; idx < nseq; idx++)
373 {
374 apos = -1;
375 for (rpos = 0; rpos < rlen[idx]-M; rpos++)
376 {
377 if (sre_random() < pop || apos == -1) /* open insertion */
378 apos = CHOOSE(M+1); /* choose 0..M */
379 ins[idx][apos]++;
380 }
381 }
382 /* calculate master_ins, max inserts */
383 alen = M;
384 for (apos = 0; apos <= M; apos++)
385 {
386 master_ins[apos] = 0;
387 for (idx = 0; idx < nseq; idx++)
388 if (ins[idx][apos] > master_ins[apos])
389 master_ins[apos] = ins[idx][apos];
390 alen += master_ins[apos];
391 }
392
393
394 /* Now, construct alignment
395 */
396 if ((aseqs = (char **) malloc (sizeof (char *) * nseq)) == NULL)
397 { squid_errno = SQERR_MEM; return 0; }
398 for (idx = 0; idx < nseq; idx++)
399 if ((aseqs[idx] = (char *) malloc (sizeof(char) * (alen+1))) == NULL)
400 { squid_errno = SQERR_MEM; return 0; }
401
402 for (idx = 0; idx < nseq; idx++)
403 {
404 apos = rpos = 0;
405
406 for (statepos = 0; statepos <= M; statepos++)
407 {
408 for (count = 0; count < ins[idx][statepos]; count++)
409 aseqs[idx][apos++] = rseqs[idx][rpos++];
410 for (; count < master_ins[statepos]; count++)
411 aseqs[idx][apos++] = ' ';
412
413 if (statepos != M)
414 aseqs[idx][apos++] = rseqs[idx][rpos++];
415 }
416 aseqs[idx][alen] = '\0';
417 }
418 ainfo->flags = 0;
419 ainfo->alen = alen; ainfo->flags |= AINFO_ALEN;
420
421 if ((ainfo->sqinfo = (SQINFO *) malloc (sizeof(SQINFO) * nseq)) == NULL)
422 Die("malloc failed");
423 for (idx = 0; idx < nseq; idx++)
424 SeqinfoCopy(&(ainfo->sqinfo[idx]), &(sqinfo[idx]));
425
426 free(rlen);
427 free(master_ins);
428 Free2DArray(ins, nseq);
429 *ret_aseqs = aseqs;
430 return 1;
431 }
432