1 /*****************************************************************
2  * SQUID - a library of functions for biological sequence analysis
3  * Copyright (C) 1992-2002 Washington University School of Medicine
4  *
5  *     This source code is freely distributed under the terms of the
6  *     GNU General Public License. See the files COPYRIGHT and LICENSE
7  *     for details.
8  *****************************************************************/
9 
10 /* msf.c
11  * SRE, Sun Jul 11 16:17:32 1993
12  *
13  * Import/export of GCG MSF multiple sequence alignment
14  * formatted files. Designed using format specifications
15  * kindly provided by Steve Smith of Genetics Computer Group.
16  *
17  * RCS $Id: msf.c 316 2016-12-16 16:14:39Z fabian $ (Original squid RCS Id: msf.c,v 1.4 2001/04/23 00:35:33 eddy Exp)
18  */
19 
20 #include <stdio.h>
21 #include <stdlib.h>
22 #include <string.h>
23 #include <ctype.h>
24 #include  <time.h>
25 #include "squid.h"
26 #include "msa.h"
27 
28 #ifdef TESTDRIVE_MSF
29 /*****************************************************************
30  * msf.c test driver:
31  * cc -DTESTDRIVE_MSF -g -O2 -Wall -o test msf.c msa.c gki.c sqerror.c sre_string.c file.c hsregex.c sre_math.c sre_ctype.c sqio.c alignio.c selex.c interleaved.c types.c -lm
32  *
33  */
34 int
main(int argc,char ** argv)35 main(int argc, char **argv)
36 {
37   MSAFILE *afp;
38   MSA     *msa;
39   char    *file;
40 
41   file = argv[1];
42 
43   if ((afp = MSAFileOpen(file, MSAFILE_STOCKHOLM, NULL)) == NULL)
44     Die("Couldn't open %s\n", file);
45 
46   while ((msa = ReadMSF(afp)) != NULL)
47     {
48       WriteMSF(stdout, msa);
49       MSAFree(msa);
50     }
51 
52   MSAFileClose(afp);
53   exit(0);
54 }
55 /******************************************************************/
56 #endif /* testdrive_msf */
57 
58 
59 
60 /* Function: ReadMSF()
61  * Date:     SRE, Tue Jun  1 08:07:22 1999 [St. Louis]
62  *
63  * Purpose:  Parse an alignment read from an open MSF format
64  *           alignment file. (MSF is a single-alignment format.)
65  *           Return the alignment, or NULL if we've already
66  *           read the alignment.
67  *
68  * Args:     afp  - open alignment file
69  *
70  * Returns:  MSA * - an alignment object
71  *                   caller responsible for an MSAFree()
72  *           NULL if no more alignments
73  *
74  * Diagnostics:
75  *           Will Die() here with a (potentially) useful message
76  *           if a parsing error occurs.
77  */
78 MSA *
ReadMSF(MSAFILE * afp)79 ReadMSF(MSAFILE *afp)
80 {
81   MSA    *msa;
82   char   *s;
83   int     alleged_alen = 0;
84   int     alleged_type = 0;
85   int     alleged_checksum = 0;
86   char   *tok;
87   char   *sp;
88   int     slen;
89   int     sqidx;
90   char   *name;
91   char   *seq;
92 
93   if (feof(afp->f)) return NULL;
94   if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
95 
96   /* The first line is the header.
97    * This is a new-ish GCG feature. Don't count on it, so
98    * we can be a bit more tolerant towards non-GCG software
99    * generating "MSF" files.
100    */
101   msa = MSAAlloc(10, 0);
102   if      (strncmp(s, "!!AA_MULTIPLE_ALIGNMENT", 23) == 0) {
103     msa->type = kAmino;
104     if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
105   } else if (strncmp(s, "!!NA_MULTIPLE_ALIGNMENT", 23) == 0) {
106     msa->type = kRNA;
107     if ((s = MSAFileGetLine(afp)) == NULL) return NULL;
108   }
109 
110   /* Now we're in the free text comment section of the MSF file.
111    * It ends when we see the "MSF: Type: Check: .." line.
112    * This line must be present.
113    */
114   do
115     {
116       if ((strstr(s, "..") != NULL && strstr(s, "MSF:") != NULL) &&
117 	  Strparse("^.+MSF: +([0-9]+) +Type: +([PNX]).+Check: +([0-9]+) +\\.\\.", s, 3))
118 	{
119 	  alleged_alen     = atoi(sqd_parse[0]);
120 	  switch (*(sqd_parse[1])) {
121 	  case 'N' : alleged_type = kRNA;      break;
122 	  case 'P' : alleged_type = kAmino;    break;
123 	  case 'X' : alleged_type = kOtherSeq; break;
124 	  default  : alleged_type = kOtherSeq;
125 	  }
126 	  alleged_checksum = atoi(sqd_parse[3]);
127 	  if (msa->type == kOtherSeq) msa->type = alleged_type;
128 	  break;		/* we're done with comment section. */
129 	}
130       if (! IsBlankline(s))
131 	MSAAddComment(msa, s);
132     } while ((s = MSAFileGetLine(afp)) != NULL);
133 
134   /* Now we're in the name section.
135    * GCG has a relatively poorly documented feature: only sequences that
136    * appear in this list will be read from the alignment section. Commenting
137    * out sequences in the name list (by preceding them with "!") is
138    * allowed as a means of manually defining subsets of sequences in
139    * the alignment section. We can support this feature reasonably
140    * easily because of the hash table for names in the MSA: we
141    * only add names to the hash table when we see 'em in the name section.
142    */
143   while ((s = MSAFileGetLine(afp)) != NULL)
144     {
145       while ((*s == ' ' || *s == '\t') && *s) s++; /* skip leading whitespace */
146 
147       if      (*s == '\n')   continue;                 /* skip blank lines */
148       else if (*s == '!')    MSAAddComment(msa, s);
149       else if ((sp  = strstr(s, "Name:")) != NULL)
150 	{
151 				/* We take the name and the weigh, and that's it */
152 	  sp   += 5;
153 	  tok   = sre_strtok(&sp, " \t", &slen); /* <sequence name> */
154 	  sqidx = GKIStoreKey(msa->index, tok);
155 	  if (sqidx >= msa->nseqalloc) MSAExpand(msa);
156 	  msa->sqname[sqidx] = sre_strdup(tok, slen);
157 	  msa->nseq++;
158 
159 	  if ((sp = strstr(sp, "Weight:")) == NULL)
160 	    Die("No Weight: on line %d for %s in name section of MSF file %s\n",
161 		afp->linenumber, msa->sqname[sqidx],  afp->fname);
162 	  sp += 7;
163 	  tok = sre_strtok(&sp, " \t", &slen);
164 	  msa->wgt[sqidx] = atof(tok);
165 	  msa->flags |= MSA_SET_WGT;
166 	}
167       else if (strncmp(s, "//", 2) == 0)
168 	break;
169       else
170 	{
171 	  Die("Invalid line (probably %d) in name section of MSF file %s:\n%s\n",
172 	      afp->linenumber, afp->fname, s);
173 	  squid_errno = SQERR_FORMAT; /* NOT THREADSAFE */
174 	  return NULL;
175 	}
176 
177     }
178 
179   /* And now we're in the sequence section.
180    * As discussed above, if we haven't seen a sequence name, then we
181    * don't include the sequence in the alignment.
182    * Also, watch out for coordinate-only lines.
183    */
184   while ((s = MSAFileGetLine(afp)) != NULL)
185     {
186       sp  = s;
187       if ((name = sre_strtok(&sp, " \t", NULL)) == NULL) continue;
188       if ((seq  = sre_strtok(&sp, "\n",  &slen)) == NULL) continue;
189 
190       /* The test for a coord line: digits starting both fields
191        */
192       if (isdigit(*name) && isdigit(*seq))
193 	continue;
194 
195       /* It's not blank, and it's not a coord line: must be sequence
196        */
197       sqidx = GKIKeyIndex(msa->index, name);
198       if (sqidx < 0) continue;	/* not a sequence we recognize */
199 
200       msa->sqlen[sqidx] = sre_strcat(&(msa->aseq[sqidx]), msa->sqlen[sqidx], seq, slen);
201     }
202 
203   /* We've left blanks in the aseqs; take them back out.
204    */
205   for (sqidx = 0; sqidx <  msa->nseq; sqidx++)
206     {
207       if (msa->aseq[sqidx] == NULL)
208 	Die("Didn't find a sequence for %s in MSF file %s\n", msa->sqname[sqidx], afp->fname);
209 
210       for (s = sp = msa->aseq[sqidx]; *s != '\0'; s++)
211 	{
212 	  if (*s == ' ' || *s == '\t') {
213 	    msa->sqlen[sqidx]--;
214 	  } else {
215 	    *sp = *s;
216 	    sp++;
217 	  }
218 	}
219       *sp = '\0';
220     }
221 
222   MSAVerifyParse(msa);		/* verifies, and also sets alen and wgt. */
223   return msa;
224 }
225 
226 
227 /* Function: WriteMSF()
228  * Date:     SRE, Mon May 31 11:25:18 1999 [St. Louis]
229  *
230  * Purpose:  Write an alignment in MSF format to an open file.
231  *
232  * Args:     fp    - file that's open for writing.
233  *           msa   - alignment to write.
234  *
235  *                   Note that msa->type, usually optional, must be
236  *                   set for WriteMSF to work. If it isn't, a fatal
237  *                   error is generated.
238  *
239  * Returns:  (void)
240  */
241 void
WriteMSF(FILE * fp,MSA * msa)242 WriteMSF(FILE *fp, MSA *msa)
243 {
244   time_t now;			/* current time as a time_t */
245   char   date[64];		/* today's date in GCG's format "October 3, 1996 15:57" */
246   char **gcg_aseq;              /* aligned sequences with gaps converted to GCG format */
247   char **gcg_sqname;		/* sequence names with GCG-valid character sets */
248   int    idx;			/* counter for sequences         */
249   char  *s;                     /* pointer into sqname or seq    */
250   int    len;			/* tmp variable for name lengths */
251   int    namelen;		/* maximum name length used      */
252   int    pos;			/* position counter              */
253   char   buffer[51];		/* buffer for writing seq        */
254   int    i;			/* another position counter */
255 
256   /*****************************************************************
257    * Make copies of sequence names and sequences.
258    *   GCG recommends that name characters should only contain
259    *   alphanumeric characters, -, or _
260    *   Some GCG and GCG-compatible software is sensitive to this.
261    *   We silently convert all other characters to '_'.
262    *
263    *   For sequences, GCG allows only ~ and . for gaps.
264    *   Otherwise, everthing is interpreted as a residue;
265    *   so squid's IUPAC-restricted chars are fine. ~ means
266    *   an external gap. . means an internal gap.
267    *****************************************************************/
268 
269 				/* make copies that we can edit */
270    gcg_aseq   = MallocOrDie(sizeof(char *) * msa->nseq);
271    gcg_sqname = MallocOrDie(sizeof(char *) * msa->nseq);
272    for (idx = 0; idx < msa->nseq; idx++)
273      {
274        gcg_aseq[idx]   = sre_strdup(msa->aseq[idx],   msa->alen);
275        gcg_sqname[idx] = sre_strdup(msa->sqname[idx], -1);
276      }
277 				/* alter names as needed  */
278    for (idx = 0; idx < msa->nseq; idx++)
279      for (s = gcg_sqname[idx]; *s != '\0'; s++)
280        if (! isalnum((int) *s) && *s != '-' && *s != '_')
281 	 *s = '_';
282 				/* alter gap chars in seq  */
283    for (idx = 0; idx < msa->nseq; idx++)
284      {
285        for (s = gcg_aseq[idx]; *s != '\0' && isgap(*s); s++)
286 	 *s = '~';
287        for (; *s != '\0'; s++)
288 	 if (isgap(*s)) *s = '.';
289        for (pos = msa->alen-1; pos > 0 && isgap(gcg_aseq[idx][pos]); pos--)
290 	 gcg_aseq[idx][pos] = '~';
291      }
292 				/* calculate max namelen used */
293   namelen = 0;
294   for (idx = 0; idx < msa->nseq; idx++)
295     if ((len = strlen(msa->sqname[idx])) > namelen)
296       namelen = len;
297 
298   /*****************************************************
299    * Write the MSF header
300    *****************************************************/
301 				/* required file type line */
302   if (msa->type == kOtherSeq)
303     msa->type = GuessAlignmentSeqtype(msa->aseq, msa->nseq);
304 
305   if      (msa->type == kRNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
306   else if (msa->type == kDNA)   fprintf(fp, "!!NA_MULTIPLE_ALIGNMENT 1.0\n");
307   else if (msa->type == kAmino) fprintf(fp, "!!AA_MULTIPLE_ALIGNMENT 1.0\n");
308   else if (msa->type == kOtherSeq)
309     Die("WriteMSF(): couldn't guess whether that alignment is RNA or protein.\n");
310   else
311     Die("Invalid sequence type %d in WriteMSF()\n", msa->type);
312 
313 				/* free text comments */
314   if (msa->ncomment > 0)
315     {
316       for (idx = 0; idx < msa->ncomment; idx++)
317 	fprintf(fp, "%s\n", msa->comment[idx]);
318       fprintf(fp, "\n");
319     }
320 				/* required checksum line */
321   now = time(NULL);
322   if (strftime(date, 64, "%B %d, %Y %H:%M", localtime(&now)) == 0)
323     Die("What time is it on earth? strftime() failed in WriteMSF().\n");
324   fprintf(fp, " %s  MSF: %d  Type: %c  %s  Check: %d  ..\n",
325 	  msa->name != NULL ? msa->name : "squid.msf",
326 	  msa->alen,
327 	  msa->type == kRNA ? 'N' : 'P',
328 	  date,
329 	  GCGMultchecksum(gcg_aseq, msa->nseq));
330   fprintf(fp, "\n");
331 
332   /*****************************************************
333    * Names/weights section
334    *****************************************************/
335 
336   for (idx = 0; idx < msa->nseq; idx++)
337     {
338       fprintf(fp, " Name: %-*.*s  Len:  %5d  Check: %4d  Weight: %.2f\n",
339 	      namelen, namelen,
340 	      gcg_sqname[idx],
341 	      msa->alen,
342 	      GCGchecksum(gcg_aseq[idx], msa->alen),
343 	      msa->wgt[idx]);
344     }
345   fprintf(fp, "\n");
346   fprintf(fp, "//\n");
347 
348   /*****************************************************
349    * Write the sequences
350    *****************************************************/
351 
352   for (pos = 0; pos < msa->alen; pos += 50)
353     {
354       fprintf(fp, "\n");	/* Blank line between sequence blocks */
355 
356 				/* Coordinate line */
357       len = (pos + 50) > msa->alen ? msa->alen - pos : 50;
358       if (len > 10)
359 	fprintf(fp, "%*s  %-6d%*s%6d\n", namelen, "",
360 		pos+1,
361 		len + ((len-1)/10) - 12, "",
362 		pos + len);
363       else
364 	fprintf(fp, "%*s  %-6d\n", namelen, "", pos+1);
365 
366       for (idx = 0; idx < msa->nseq; idx++)
367 	{
368 	  fprintf(fp, "%-*s ", namelen, gcg_sqname[idx]);
369 				/* get next line's worth of 50 from seq */
370 	  strncpy(buffer, gcg_aseq[idx] + pos, 50);
371 	  buffer[50] = '\0';
372 				/* draw the sequence line */
373 	  for (i = 0; i < len; i++)
374 	    {
375 	      if (! (i % 10)) fputc(' ', fp);
376 	      fputc(buffer[i], fp);
377 	    }
378 	  fputc('\n', fp);
379 	}
380     }
381 
382   Free2DArray((void **) gcg_aseq,   msa->nseq);
383   Free2DArray((void **) gcg_sqname, msa->nseq);
384   return;
385 }
386 
387 
388 
389