1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2013, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __GZFASTQ_H__
22 #define __GZFASTQ_H__
23 
24 #ifdef HAVE_LIBZ
25 
26 #include <cerrno>
27 #include <zlib.h>
28 #include "input.h"
29 
30 class GzFastq: public Input {
31     gzFile gz_fh;
32 
33 public:
GzFastq(string path)34     GzFastq(string path) : Input() {
35         this->gz_fh = gzopen(path.c_str(), "rb");
36         if (!this->gz_fh) {
37             cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
38             exit(EXIT_FAILURE);
39         }
40         #if ZLIB_VERNUM >= 0x1240
41         gzbuffer(this->gz_fh, libz_buffer_size);
42         #endif
43     };
GzFastq(const char * path)44     GzFastq(const char *path) : Input() {
45         this->gz_fh = gzopen(path, "rb");
46         if (!this->gz_fh) {
47             cerr << "Failed to open gzipped file '" << path << "': " << strerror(errno) << ".\n";
48             exit(EXIT_FAILURE);
49         }
50         #if ZLIB_VERNUM >= 0x1240
51         gzbuffer(this->gz_fh, libz_buffer_size);
52         #endif
53     };
~GzFastq()54     ~GzFastq() {
55         gzclose(this->gz_fh);
56     };
57     Seq *next_seq();
58     int  next_seq(Seq &s);
59 };
60 
next_seq()61 Seq *GzFastq::next_seq() {
62     char *res = NULL;
63 
64     //
65     // Check the contents of the line buffer. When we finish reading a FASTQ record
66     // the buffer will either contain whitespace or the header of the next FASTQ
67     // record.
68     //
69     this->line[0] = '\0';
70     do {
71         res = gzgets(this->gz_fh, this->line, max_len);
72     } while (this->line[0] != '@' && res != NULL);
73 
74     if (res == NULL) {
75         return NULL;
76     }
77 
78     //
79     // Check if there is a carraige return in the buffer
80     //
81     uint len = strlen(this->line);
82     if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
83     if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
84 
85     //
86     // Initialize the Seq structure and store the FASTQ ID
87     //
88     Seq *s = new Seq;
89     s->id = new char[strlen(this->line) + 1];
90     strcpy(s->id, this->line + 1);
91 
92     //
93     // Read the sequence from the file
94     //
95     gzgets(this->gz_fh, this->line, max_len);
96 
97     if (gzeof(this->gz_fh)) {
98         return NULL;
99     }
100 
101     len = strlen(this->line);
102     if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
103     if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
104 
105     s->seq = new char[len + 1];
106     strcpy(s->seq, this->line);
107 
108     //
109     // Read the repeat of the ID
110     //
111     this->line[0] = '\0';
112     res = gzgets(this->gz_fh, this->line, max_len);
113 
114     if (this->line[0] != '+' || res == NULL) {
115         return NULL;
116     }
117 
118     //
119     // Read the quality score from the file
120     //
121     this->line[0] = '\0';
122     res = gzgets(this->gz_fh, this->line, max_len);
123 
124     if (res == NULL && strlen(this->line) == 0) {
125         return NULL;
126     }
127 
128     len = strlen(this->line);
129     if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
130     if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
131 
132     s->qual = new char[len + 1];
133     strcpy(s->qual, this->line);
134 
135     //
136     // Clear the line buffer so it is set up for the next record. If a '@'
137     // appears in the quality scores read, it will break parsing next time
138     // it is called.
139     //
140     this->line[0] = '\0';
141 
142     return s;
143 }
144 
next_seq(Seq & s)145 int GzFastq::next_seq(Seq &s) {
146     char *res = NULL;
147 
148     //
149     // Check the contents of the line buffer. When we finish reading a FASTQ record
150     // the buffer will either contain whitespace or the header of the next FASTQ
151     // record.
152     //
153     this->line[0] = '\0';
154     do {
155         res = gzgets(this->gz_fh, this->line, max_len);
156     } while (this->line[0] != '@' && res != NULL);
157 
158     if (res == NULL) {
159         return 0;
160     }
161 
162     //
163     // Check if there is a carraige return in the buffer
164     //
165     uint len = strlen(this->line);
166     if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
167     if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
168 
169     //
170     // Store the FASTQ ID
171     //
172     strcpy(s.id, this->line + 1);
173 
174     //
175     // Read the sequence from the file
176     //
177     this->line[0] = '\0';
178     res = gzgets(this->gz_fh, this->line, max_len);
179 
180     if (res == NULL) {
181         return 0;
182     }
183 
184     len = strlen(this->line);
185     if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
186     if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
187 
188     strcpy(s.seq, this->line);
189 
190     //
191     // Read the repeat of the ID
192     //
193     this->line[0] = '\0';
194     res = gzgets(this->gz_fh, this->line, max_len);
195 
196     if (this->line[0] != '+' || res == NULL) {
197         return 0;
198     }
199 
200     //
201     // Read the quality score from the file
202     //
203     this->line[0] = '\0';
204     res = gzgets(this->gz_fh, this->line, max_len);
205 
206     if (res == NULL && strlen(this->line) == 0) {
207         return 0;
208     }
209 
210     len = strlen(this->line);
211     if (len > 0 && this->line[len - 1] == '\n') this->line[len - 1] = '\0';
212     if (len > 0 && this->line[len - 2] == '\r') this->line[len - 2] = '\0';
213 
214     strcpy(s.qual, this->line);
215 
216     //
217     // Clear the line buffer so it is set up for the next record. If a '@'
218     // appears in the quality scores read, it will break parsing next time
219     // it is called.
220     //
221     this->line[0] = '\0';
222 
223     return 1;
224 }
225 
226 #else  // If HAVE_LIBZ is undefined and zlib library is not present.
227 
228 #include "input.h"
229 
230 class GzFastq: public Input {
231  public:
GzFastq(const char * path)232     GzFastq(const char *path) : Input() { cerr << "Gzip support was not enabled when Stacks was compiled.\n"; };
GzFastq(string path)233     GzFastq(string path) : Input() { cerr << "Gzip support was not enabled when Stacks was compiled.\n"; };
~GzFastq()234     ~GzFastq() {};
next_seq()235     Seq *next_seq()      { return NULL; };
next_seq(Seq &)236     int  next_seq(Seq &) { return 0; };
237 };
238 
239 #endif // HAVE_LIBZ
240 
241 #endif // __GZFASTQ_H__
242