1 /***************************************************************** 2 * SQUID - a library of functions for biological sequence analysis 3 * Copyright (C) 1992-2002 Washington University School of Medicine 4 * 5 * This source code is freely distributed under the terms of the 6 * GNU General Public License. See the files COPYRIGHT and LICENSE 7 * for details. 8 *****************************************************************/ 9 10 #ifndef SQUID_MSA_INCLUDED 11 #define SQUID_MSA_INCLUDED 12 13 /* msa.h 14 * SRE, Mon May 17 10:24:30 1999 15 * 16 * Header file for SQUID's multiple sequence alignment 17 * manipulation code. 18 * 19 * RCS $Id: msa.h 297 2014-10-31 13:02:37Z fabian $ (Original squid RCS Id: msa.h,v 1.12 2002/10/12 04:40:35 eddy Exp) 20 */ 21 22 #include <stdio.h> /* FILE support */ 23 #include "gki.h" /* hash table support */ 24 #include "ssi.h" /* sequence file index support */ 25 #include "squid.h" /* need SQINFO */ 26 27 /**************************************************** 28 * Obsolete alignment information, AINFO 29 * Superceded by MSA structure further below; but we 30 * need AINFO for the near future for backwards 31 * compatibility. 32 ****************************************************/ 33 /* Structure: aliinfo_s 34 * 35 * Purpose: Optional information returned from an alignment file. 36 * 37 * flags: always used. Flags for which info is valid/alloced. 38 * 39 * alen: mandatory. Alignments are always flushed right 40 * with gaps so that all aseqs are the same length, alen. 41 * Available for all alignment formats. 42 * 43 * nseq: mandatory. Aligned seqs are indexed 0..nseq-1. 44 * 45 * wgt: 0..nseq-1 vector of sequence weights. Mandatory. 46 * If not explicitly set, weights are initialized to 1.0. 47 * 48 * cs: 0..alen-1, just like the alignment. Contains single-letter 49 * secondary structure codes for consensus structure; "<>^+" 50 * for RNA, "EHL." for protein. May be NULL if unavailable 51 * from seqfile. Only available for SELEX format files. 52 * 53 * rf: 0..alen-1, just like the alignment. rf is an arbitrary string 54 * of characters, used for annotating columns. Blanks are 55 * interpreted as non-canonical columns and anything else is 56 * considered canonical. Only available from SELEX files. 57 * 58 * sqinfo: mandatory. Array of 0..nseq-1 59 * per-sequence information structures, carrying 60 * name, id, accession, coords. 61 * 62 */ 63 struct aliinfo_s { 64 int flags; /* flags for what info is valid */ 65 int alen; /* length of alignment (columns) */ 66 int nseq; /* number of seqs in alignment */ 67 float *wgt; /* sequence weights [0..nseq-1] */ 68 char *cs; /* consensus secondary structure string */ 69 char *rf; /* reference coordinate system */ 70 struct seqinfo_s *sqinfo; /* name, id, coord info for each sequence */ 71 72 /* Pfam/HMMER pick-ups */ 73 char *name; /* name of alignment */ 74 char *desc; /* description of alignment */ 75 char *acc; /* accession of alignment */ 76 char *au; /* "author" information */ 77 float tc1, tc2; /* trusted score cutoffs (per-seq, per-domain) */ 78 float nc1, nc2; /* noise score cutoffs (per-seq, per-domain) */ 79 float ga1, ga2; /* gathering cutoffs */ 80 }; 81 typedef struct aliinfo_s AINFO; 82 #define AINFO_TC (1 << 0) 83 #define AINFO_NC (1 << 1) 84 #define AINFO_GA (1 << 2) 85 86 /***************************************************************** 87 * MSA 88 * SRE, Sun Jun 27 15:03:35 1999 [TW 723 over Greenland] 89 * 90 * Defines the new data structure and API for multiple 91 * sequence alignment i/o. 92 *****************************************************************/ 93 94 /* The following constants define the Pfam/Rfam cutoff set we'll propagate 95 * from msa's into HMMER and Infernal models. 96 */ 97 #define MSA_CUTOFF_TC1 0 98 #define MSA_CUTOFF_TC2 1 99 #define MSA_CUTOFF_GA1 2 100 #define MSA_CUTOFF_GA2 3 101 #define MSA_CUTOFF_NC1 4 102 #define MSA_CUTOFF_NC2 5 103 #define MSA_MAXCUTOFFS 6 104 105 /* Structure: MSA 106 * SRE, Tue May 18 11:33:08 1999 107 * 108 * Our object for a multiple sequence alignment. 109 */ 110 typedef struct msa_struct { 111 /* Mandatory information associated with the alignment. 112 */ 113 char **aseq; /* the alignment itself, [0..nseq-1][0..alen-1] */ 114 char **sqname; /* names of sequences, [0..nseq-1][0..alen-1] */ 115 float *wgt; /* sequence weights [0..nseq-1] */ 116 int alen; /* length of alignment (columns) */ 117 int nseq; /* number of seqs in alignment */ 118 119 /* Optional information that we understand, and might have. 120 */ 121 int flags; /* flags for what optional info is valid */ 122 int type; /* kOtherSeq, kRNA/hmmNUCLEIC, or kAmino/hmmAMINO */ 123 char *name; /* name of alignment, or NULL */ 124 char *desc; /* description of alignment, or NULL */ 125 char *acc; /* accession of alignment, or NULL */ 126 char *au; /* "author" information, or NULL */ 127 char *ss_cons; /* consensus secondary structure string, or NULL */ 128 char *sa_cons; /* consensus surface accessibility string, or NULL */ 129 char *rf; /* reference coordinate system, or NULL */ 130 char **sqacc; /* accession numbers for individual sequences */ 131 char **sqdesc; /* description lines for individual sequences */ 132 char **ss; /* per-seq secondary structure annotation, or NULL */ 133 char **co; /* per-seq confidence of secondary structure annotation, or NULL, -> r296, FS */ 134 char **sa; /* per-seq surface accessibility annotation, or NULL */ 135 float cutoff[MSA_MAXCUTOFFS]; /* NC, TC, GA cutoffs propagated to Pfam/Rfam */ 136 int cutoff_is_set[MSA_MAXCUTOFFS];/* TRUE if a cutoff is set; else FALSE */ 137 138 /* Optional information that we don't understand. 139 * That is, we know what type of information it is, but it's 140 * either (interpreted as) free-text comment, or it's Stockholm 141 * markup with unfamiliar tags. 142 */ 143 char **comment; /* free text comments, or NULL */ 144 int ncomment; /* number of comment lines */ 145 int alloc_ncomment; /* number of comment lines alloc'ed */ 146 147 char **gf_tag; /* markup tags for unparsed #=GF lines */ 148 char **gf; /* annotations for unparsed #=GF lines */ 149 int ngf; /* number of unparsed #=GF lines */ 150 int alloc_ngf; /* number of gf lines alloc'ed */ 151 152 char **gs_tag; /* markup tags for unparsed #=GS lines */ 153 char ***gs; /* [0..ngs-1][0..nseq-1][free text] markup */ 154 GKI *gs_idx; /* hash of #=GS tag types */ 155 int ngs; /* number of #=GS tag types */ 156 157 char **gc_tag; /* markup tags for unparsed #=GC lines */ 158 char **gc; /* [0..ngc-1][0..alen-1] markup */ 159 GKI *gc_idx; /* hash of #=GC tag types */ 160 int ngc; /* number of #=GC tag types */ 161 162 char **gr_tag; /* markup tags for unparsed #=GR lines */ 163 char ***gr; /* [0..ngr][0..nseq-1][0..alen-1] markup */ 164 GKI *gr_idx; /* hash of #=GR tag types */ 165 int ngr; /* number of #=GR tag types */ 166 167 /* Stuff we need for our own maintenance of the data structure 168 */ 169 GKI *index; /* name ->seqidx hash table */ 170 int nseqalloc; /* number of seqs currently allocated for */ 171 int nseqlump; /* lump size for dynamic expansions of nseq */ 172 int *sqlen; /* individual sequence lengths during parsing */ 173 int *sslen; /* individual ss lengths during parsing */ 174 int *salen; /* individual sa lengths during parsing */ 175 int *colen; /* individual co lengths during parsing */ 176 int lastidx; /* last index we saw; use for guessing next */ 177 } MSA; 178 #define MSA_SET_WGT (1 << 0) /* track whether wgts were set, or left at default 1.0 */ 179 180 181 /* Structure: MSAFILE 182 * SRE, Tue May 18 11:36:54 1999 183 * 184 * Defines an alignment file that's open for reading. 185 */ 186 typedef struct msafile_struct { 187 FILE *f; /* open file pointer */ 188 char *fname; /* name of file. used for diagnostic output */ 189 int linenumber; /* what line are we on in the file */ 190 191 char *buf; /* buffer for line input w/ sre_fgets() */ 192 int buflen; /* current allocated length for buf */ 193 194 SSIFILE *ssi; /* open SSI index file; or NULL, if none. */ 195 196 int do_gzip; /* TRUE if f is a pipe from gzip -dc (need pclose(f)) */ 197 int do_stdin; /* TRUE if f is stdin (don't close f, not our problem) */ 198 int format; /* format of alignment file we're reading */ 199 } MSAFILE; 200 201 202 /* Alignment file formats. 203 * Must coexist with sqio.c/squid.h unaligned file format codes. 204 * Rules: 205 * - 0 is an unknown/unassigned format 206 * - <100 reserved for unaligned formats 207 * - >100 reserved for aligned formats 208 */ 209 #define MSAFILE_UNKNOWN 0 /* unknown format */ 210 #define MSAFILE_STOCKHOLM 101 /* Pfam/HMMER's Stockholm format */ 211 #define MSAFILE_SELEX 102 /* Obsolete(!): old HMMER/SELEX format */ 212 #define MSAFILE_MSF 103 /* GCG MSF format */ 213 #define MSAFILE_CLUSTAL 104 /* Clustal V/W format */ 214 #define MSAFILE_A2M 105 /* aligned FASTA (A2M is UCSC terminology) */ 215 #define MSAFILE_PHYLIP 106 /* Felsenstein's PHYLIP format */ 216 #define MSAFILE_EPS 107 /* Encapsulated PostScript (output only) */ 217 #ifdef CLUSTALO 218 #define MSAFILE_VIENNA 108 /* Vienna: concatenated fasta */ 219 #define MSAFILE_DUBLIN 109 /* Dublin: modified Stockholm format */ 220 #endif 221 222 #define IsAlignmentFormat(fmt) ((fmt) > 100) 223 224 225 /* from msa.c 226 */ 227 extern MSAFILE *MSAFileOpen(char *filename, int format, char *env); 228 extern MSA *MSAFileRead(MSAFILE *afp); 229 extern void MSAFileClose(MSAFILE *afp); 230 extern void MSAFree(MSA *msa); 231 #ifdef CLUSTALO 232 extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline, int iWrap, int bResno, int iSeqtype); 233 #else 234 extern void MSAFileWrite(FILE *fp, MSA *msa, int outfmt, int do_oneline); 235 #endif 236 237 extern int MSAFileRewind(MSAFILE *afp); 238 extern int MSAFilePositionByKey(MSAFILE *afp, char *key); 239 extern int MSAFilePositionByIndex(MSAFILE *afp, int idx); 240 241 extern int MSAFileFormat(MSAFILE *afp); 242 extern MSA *MSAAlloc(int nseq, int alen); 243 extern void MSAExpand(MSA *msa); 244 extern char *MSAFileGetLine(MSAFILE *afp); 245 extern void MSASetSeqAccession(MSA *msa, int seqidx, char *acc); 246 extern void MSASetSeqDescription(MSA *msa, int seqidx, char *desc); 247 extern void MSAAddComment(MSA *msa, char *s); 248 extern void MSAAddGF(MSA *msa, char *tag, char *value); 249 extern void MSAAddGS(MSA *msa, char *tag, int seqidx, char *value); 250 extern void MSAAppendGC(MSA *msa, char *tag, char *value); 251 extern char *MSAGetGC(MSA *msa, char *tag); 252 extern void MSAAppendGR(MSA *msa, char *tag, int seqidx, char *value); 253 extern void MSAVerifyParse(MSA *msa); 254 extern void MSAVerifyParseDub(MSA *msa); 255 extern int MSAGetSeqidx(MSA *msa, char *name, int guess); 256 257 extern MSA *MSAFromAINFO(char **aseq, AINFO *ainfo); 258 259 extern void MSAMingap(MSA *msa); 260 extern void MSANogap(MSA *msa); 261 extern void MSAShorterAlignment(MSA *msa, int *useme); 262 extern void MSASmallerAlignment(MSA *msa, int *useme, MSA **ret_new); 263 264 extern char *MSAGetSeqAccession(MSA *msa, int idx); 265 extern char *MSAGetSeqDescription(MSA *msa, int idx); 266 extern char *MSAGetSeqSS(MSA *msa, int idx); 267 extern char *MSAGetSeqSA(MSA *msa, int idx); 268 269 extern float MSAAverageSequenceLength(MSA *msa); 270 271 /* from a2m.c 272 */ 273 extern MSA *ReadA2M(MSAFILE *afp); 274 #ifdef CLUSTALO 275 /*extern void WriteA2M(FILE *fp, MSA *msa, int vienna);*/ 276 extern void WriteA2M(FILE *fp, MSA *msa, int iWrap); 277 #else 278 extern void WriteA2M(FILE *fp, MSA *msa); 279 #endif 280 /* from clustal.c 281 */ 282 extern MSA *ReadClustal(MSAFILE *afp); 283 #ifdef CLUSTALO 284 extern void WriteClustal(FILE *fp, MSA *msa, int iWrap, int bResno, int iSeqType); 285 #else 286 extern void WriteClustal(FILE *fp, MSA *msa); 287 #endif 288 289 /* from eps.c 290 */ 291 extern void EPSWriteSmallMSA(FILE *fp, MSA *msa); 292 293 /* from msf.c 294 */ 295 extern MSA *ReadMSF(MSAFILE *afp); 296 extern void WriteMSF(FILE *fp, MSA *msa); 297 298 /* from phylip.c 299 */ 300 extern MSA *ReadPhylip(MSAFILE *afp); 301 extern void WritePhylip(FILE *fp, MSA *msa); 302 303 /* from selex.c 304 */ 305 extern MSA *ReadSELEX(MSAFILE *afp); 306 extern void WriteSELEX(FILE *fp, MSA *msa); 307 extern void WriteSELEXOneBlock(FILE *fp, MSA *msa); 308 309 /* from stockholm.c 310 */ 311 extern MSA *ReadDublin(MSAFILE *afp); 312 extern MSA *ReadStockholm(MSAFILE *afp); 313 extern void WriteStockholm(FILE *fp, MSA *msa); 314 extern void WriteStockholmOneBlock(FILE *fp, MSA *msa); 315 316 #endif /*SQUID_MSA_INCLUDED*/ 317