1 /*
2  * Copyright (c) 2003, 2005-2007, 2010, 2013 Genome Research Ltd.
3  * Author(s): James Bonfield
4  *
5  * Redistribution and use in source and binary forms, with or without
6  * modification, are permitted provided that the following conditions are met:
7  *
8  *    1. Redistributions of source code must retain the above copyright notice,
9  *       this list of conditions and the following disclaimer.
10  *
11  *    2. Redistributions in binary form must reproduce the above
12  *       copyright notice, this list of conditions and the following
13  *       disclaimer in the documentation and/or other materials provided
14  *       with the distribution.
15  *
16  *    3. Neither the names Genome Research Ltd and Wellcome Trust Sanger
17  *    Institute nor the names of its contributors may be used to endorse
18  *    or promote products derived from this software without specific
19  *    prior written permission.
20  *
21  * THIS SOFTWARE IS PROVIDED BY GENOME RESEARCH LTD AND CONTRIBUTORS "AS
22  * IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
23  * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
24  * PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL GENOME RESEARCH
25  * LTD OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
26  * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
27  * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
28  * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
29  * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
30  * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
31  * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
32  */
33 
34 /*
35  * Author(s): James Bonfield, Simon Dear, Rodger Staden
36  *
37  * Copyright (c) 1995, 1997-1998, 2000-2001 MEDICAL RESEARCH COUNCIL
38  * All rights reserved
39  *
40  * Redistribution and use in source and binary forms, with or without
41  * modification, are permitted provided that the following conditions are met:
42  *
43  *    1 Redistributions of source code must retain the above copyright notice,
44  *      this list of conditions and the following disclaimer.
45  *
46  *    2 Redistributions in binary form must reproduce the above copyright
47  *      notice, this list of conditions and the following disclaimer in
48  *      the documentation and/or other materials provided with the
49  *      distribution.
50  *
51  *    3 Neither the name of the MEDICAL RESEARCH COUNCIL, THE LABORATORY OF
52  *      MOLECULAR BIOLOGY nor the names of its contributors may be used
53  *      to endorse or promote products derived from this software without
54  *      specific prior written permission.
55  *
56  * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
57  * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
58  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
59  * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
60  * LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
61  * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
62  * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
63  * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
64  * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
65  * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
66  * POSSIBILITY OF SUCH DAMAGE.
67  */
68 
69 /*
70  * Copyright (c) Medical Research Council 1994. All rights reserved.
71  *
72  * Permission to use, copy, modify and distribute this software and its
73  * documentation for any purpose is hereby granted without fee, provided that
74  * this copyright and notice appears in all copies.
75  *
76  * This file was written by James Bonfield, Simon Dear, Rodger Staden,
77  * as part of the Staden Package at the MRC Laboratory of Molecular
78  * Biology, Hills Road, Cambridge, CB2 2QH, United Kingdom.
79  *
80  * MRC disclaims all warranties with regard to this software.
81  */
82 
83 #ifndef _Read_h_
84 #define _Read_h_
85 
86 /*
87  * Title:	Read
88  *
89  * File: 	Read.h
90  * Purpose:	Read data type
91  * Last update:	June  14 1994
92  */
93 
94 /*
95  * This module encodes the `Read' sequence data structure.
96  *
97  * A `Read' contains information about bases and traces which are laid
98  * out along a single dimension of points. The number of points in a
99  * particular sequence is given by `getNPoints', and these are numbered
100  * 0..getNPoints-1. At each point there are four trace readings, one
101  * for each base.
102  *
103  * The number of bases is `getNBases' which are numbered 0..N-1.
104  * Bases are represented by `char's. Every base is located at a
105  * particular point.
106  *
107  * The behaviour of these routines is undefined if given NULLRead or
108  * an undefined sequence.
109  */
110 
111 #include "io_lib/os.h"
112 #include "io_lib/scf.h"
113 #include "io_lib/mFILE.h"
114 
115 #ifdef __cplusplus
116 extern "C" {
117 #endif
118 
119 /*
120  *-----------------------------------------------------------------------------
121  * Macros
122  *-----------------------------------------------------------------------------
123  */
124 
125 #define NULLRead     ((Read *)NULL) /* null Read pointer; returned on failure by the routines below */
126 
127 /* Trace file formats (the value 6 is not assigned to any name below) */
128 #define TT_ERR -1
129 #define TT_UNK 0
130 #define TT_SCF 1 /* the format write_reading() chooses when given format 0 */
131 #define TT_ABI 2
132 #define TT_ALF 3
133 #define TT_PLN 4
134 #define TT_EXP 5
135 #define TT_ZTR 7
136 #define TT_ZTR1 8
137 #define TT_ZTR2 9
138 #define TT_ZTR3 10
139 #define TT_BIO 11
140 #define TT_SFF 12
141 #define TT_ANY TT_UNK /* i.e. 0: read_reading() then determines the format itself */
142 /* ANYTR is specifically any *trace* type and not EXP or PLN format */
143 #define TT_ANYTR 13
144 /* One-bit flags that can be OR-ed together. NOTE(review): presumably for read_sections() -- confirm */
145 #define READ_BASES	(1<<0)
146 #define READ_SAMPLES	(1<<1)
147 #define READ_COMMENTS	(1<<2)
148 #define READ_ALL	(READ_BASES | READ_SAMPLES | READ_COMMENTS) /* all three flags set */
149 
150 /*
151  *-----------------------------------------------------------------------------
152  * Structures and typedefs
153  *-----------------------------------------------------------------------------
154  */
155 
156 typedef uint_2 TRACE;        /* for trace heights; uint_2 is declared outside this file */
157 
158 typedef struct /* A sequence reading: trace samples, base calls and related data */
159 {
160     int		format;	     /* Trace file format; presumably a TT_* value (TODO confirm) */
161     char       *trace_name;  /* Trace file name */
162 
163     int         NPoints;     /* No. of points of data */
164     int         NBases;      /* No. of bases */
165 
166     /* Traces: one array per base type (A, C, G, T) */
167     TRACE      *traceA;      /* Array of length `NPoints' */
168     TRACE      *traceC;      /* Array of length `NPoints' */
169     TRACE      *traceG;      /* Array of length `NPoints' */
170     TRACE      *traceT;      /* Array of length `NPoints' */
171     TRACE       maxTraceVal; /* The maximal value in any trace */
172     int         baseline;    /* The zero offset for TRACE values */
173 
174     /* Bases */
175     char       *base;        /* Array of length `NBases', one char per base */
176     uint_2     *basePos;     /* Array of length `NBases'; presumably each base's point (TODO confirm) */
177 
178     /* Cutoffs */
179     int         leftCutoff;  /* Number of unwanted bases */
180     int         rightCutoff; /* First unwanted base at right end */
181 
182     /* Miscellaneous Sequence Information */
183     char       *info;        /* misc seq info, eg comments */
184 
185     /* Probability information */
186     char       *prob_A;      /* Array of length 'NBases' */
187     char       *prob_C;      /* Array of length 'NBases' */
188     char       *prob_G;      /* Array of length 'NBases' */
189     char       *prob_T;      /* Array of length 'NBases' */
190 
191     /* The original input format data, or NULL if inapplicable */
192     int orig_trace_format;   /* NOTE(review): presumably a TT_* value -- confirm */
193     void (*orig_trace_free)(void *ptr); /* NOTE(review): presumably frees orig_trace -- confirm */
194     void *orig_trace;        /* None of these three fields is copied by read_dup() */
195 
196     char       *ident;	     /* Seq id, NULL for unknown. Malloced data.
197 				Owned and freed by io_lib. Not copied by read_dup(). */
198 
199     /* Pyrosequencing "peaks" (more like spikes). NULL if not used */
200     int          nflows;     /* Number of "flows" */
201     char        *flow_order; /* Bases flowed across */
202     float       *flow;       /* Processed to be 1 base unit oriented */
203     unsigned int*flow_raw;   /* Unprocessed data */
204 
205     void *private_data;	     /* The 'private data' block and size from SCF, */
206     int private_size;        /*         NULL & 0 if not present.            */
207 } Read;
208 
209 
210 /*
211  *-----------------------------------------------------------------------------
212  * Function prototypes
213  *-----------------------------------------------------------------------------
214  */
215 
216 
217 /* ----- Main I/O routines ----- */
218 
219 /*
220  * Read a sequence from a file "fn" of format "format". If "format" is 0
221  * (TT_ANY), we automatically determine the correct format.
222  * The fread_ and mfread_ variants also take a FILE or mFILE stream "fp".
223  * Returns:
224  *   Read *   for success
225  *   NULLRead for failure
226  */
227 Read *read_reading(char *fn, int format);
228 Read *fread_reading(FILE *fp, char *fn, int format);
229 Read *mfread_reading(mFILE *fp, char *fn, int format);
230 
231 
232 /*
233  * Write the sequence "read" to a file "fn" of format "format". If "format"
234  * is 0, we choose our favourite - SCF.
235  * The fwrite_ and mfwrite_ variants take a FILE or mFILE "fp" in place of "fn".
236  * Returns:
237  *   0 for success
238  *  -1 for failure
239  */
240 int write_reading(char *fn, Read *read, int format);
241 int fwrite_reading(FILE *fp, Read *read, int format);
242 int mfwrite_reading(mFILE *fp, Read *read, int format);
243 
244 
245 /* ----- Utility routines ----- */
246 
247 /*
248  * Allocate a new sequence sized for "num_points" points and "num_bases"
249  * bases. Memory allocated here is freed with read_deallocate().
250  * Returns "Read *" for success,
251  *         "NULLRead" for failure.
252  */
253 Read *read_allocate(int num_points, int num_bases);
254 
255 
256 /*
257  * Duplicates the read structure "src", optionally giving the copy the new
258  * filename "new_name" (NOTE(review): presumably NULL means no new name --
259  * confirm). The following fields are not duplicated:
260  *  int  orig_trace_format;
261  *  void (*orig_trace_free)(void *ptr);
262  *  void *orig_trace;
263  *  char *ident;
264  *
265  * Returns:
266  *   "Read *" for success
267  *   "NULLRead" for failure
268  */
269 Read* read_dup( Read* src, const char* new_name );
270 
271 
272 /*
273  * Free the memory of "read", a sequence allocated by read_allocate().
274  */
275 void read_deallocate(Read *read);
276 
277 /* Unix-specific file deletion routine; "fn" names the file to delete. */
278 /* NOTE(review): the return convention is not documented here -- check the definition. */
279 int remove_file(char *fn);
280 
281 Read *read_abi(char *fn);               /* ABI format: read by file name, ...   */
282 Read *fread_abi(FILE *fp);              /* ... from a FILE stream, ...          */
283 Read *mfread_abi(mFILE *fp);            /* ... or from an mFILE stream.         */
284 int write_abi(char *fn, Read *read);    /* Write "read"; same three variants.   */
285 int fwrite_abi(FILE *fp, Read *read);   /* NOTE(review): results presumably as  */
286 int mfwrite_abi(mFILE *fp, Read *read); /* for read/write_reading() -- confirm. */
287 
288 int write_alf(char *fn, Read *read);    /* ALF format: write "read" by name, ... */
289 int fwrite_alf(FILE *fp, Read *read);   /* ... to a FILE stream, ...             */
290 int mfwrite_alf(mFILE *fp, Read *read); /* ... or to an mFILE stream.            */
291 Read *read_alf(char *fn);               /* Readers, same three variants.         */
292 Read *fread_alf(FILE *fp);              /* NOTE(review): results presumably as   */
293 Read *mfread_alf(mFILE *fp);            /* for read/write_reading() -- confirm.  */
294 
295 int write_pln(char *fn, Read *read);    /* PLN format: write "read" by name, ... */
296 int fwrite_pln(FILE *fp, Read *read);   /* ... to a FILE stream, ...             */
297 int mfwrite_pln(mFILE *fp, Read *read); /* ... or to an mFILE stream.            */
298 Read *read_pln(char *fn);               /* Readers, same three variants.         */
299 Read *fread_pln(FILE *fp);              /* NOTE(review): results presumably as   */
300 Read *mfread_pln(mFILE *fp);            /* for read/write_reading() -- confirm.  */
301 
302 int read_sections(int sec); /* NOTE(review): "sec" is presumably an OR of READ_* flags -- confirm */
303 
304 #include "io_lib/translate.h"
305 #include "io_lib/compress.h"
306 
307 #ifdef __cplusplus
308 }
309 #endif
310 
311 #endif /* _Read_h_ */
312