1 /*
2 * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3 *
4 * This file is part of Bowtie 2.
5 *
6 * Bowtie 2 is free software: you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation, either version 3 of the License, or
9 * (at your option) any later version.
10 *
11 * Bowtie 2 is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with Bowtie 2. If not, see <http://www.gnu.org/licenses/>.
18 */
19
20 #include "pat.h"
21
22 /**
23 * Parse a name from fb_ and store in r. Assume that the next
24 * character obtained via fb_.get() is the first character of
25 * the sequence and the string stops at the next char upto (could
26 * be tab, newline, etc.).
27 */
parseName(Read & r,Read * r2,bool append,bool clearFirst,bool warnEmpty,bool useDefault,int upto)28 int QseqPatternSource::parseName(
29 Read& r, // buffer for mate 1
30 Read* r2, // buffer for mate 2 (NULL if mate2 is read separately)
31 bool append, // true -> append characters, false -> skip them
32 bool clearFirst, // clear the name buffer first
33 bool warnEmpty, // emit a warning if nothing was added to the name
34 bool useDefault, // if nothing is read, put readCnt_ as a default value
35 int upto) // stop parsing when we first reach character 'upto'
36 {
37 if(clearFirst) {
38 if(r2 != NULL) r2->name.clear();
39 r.name.clear();
40 }
41 while(true) {
42 int c;
43 if((c = fb_.get()) < 0) {
44 // EOF reached in the middle of the name
45 return -1;
46 }
47 if(c == '\n' || c == '\r') {
48 // EOL reached in the middle of the name
49 return -1;
50 }
51 if(c == upto) {
52 // Finished with field
53 break;
54 }
55 if(append) {
56 if(r2 != NULL) r2->name.append(c);
57 r.name.append(c);
58 }
59 }
60 // Set up a default name if one hasn't been set
61 if(r.name.empty() && useDefault && append) {
62 char cbuf[20];
63 itoa10(readCnt_, cbuf);
64 r.name.append(cbuf);
65 if(r2 != NULL) r2->name.append(cbuf);
66 }
67 if(r.name.empty() && warnEmpty) {
68 cerr << "Warning: read had an empty name field" << endl;
69 }
70 return (int)r.name.length();
71 }
72
73 /**
74 * Parse a single sequence from fb_ and store in r. Assume
75 * that the next character obtained via fb_.get() is the first
76 * character of the sequence and the sequence stops at the next
77 * char upto (could be tab, newline, etc.).
78 */
parseSeq(Read & r,int & charsRead,int & trim5,char upto)79 int QseqPatternSource::parseSeq(
80 Read& r,
81 int& charsRead,
82 int& trim5,
83 char upto)
84 {
85 int begin = 0;
86 int c = fb_.get();
87 assert(c != upto);
88 r.patFw.clear();
89 r.color = gColor;
90 if(gColor) {
91 // NOTE: clearly this is not relevant for Illumina output, but
92 // I'm keeping it here in case there's some reason to put SOLiD
93 // data in this format in the future.
94
95 // This may be a primer character. If so, keep it in the
96 // 'primer' field of the read buf and parse the rest of the
97 // read without it.
98 c = toupper(c);
99 if(asc2dnacat[c] > 0) {
100 // First char is a DNA char
101 int c2 = toupper(fb_.peek());
102 // Second char is a color char
103 if(asc2colcat[c2] > 0) {
104 r.primer = c;
105 r.trimc = c2;
106 trim5 += 2; // trim primer and first color
107 }
108 }
109 if(c < 0) { return -1; }
110 }
111 while(c != upto) {
112 if(c == '.') c = 'N';
113 if(gColor) {
114 if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
115 }
116 if(isalpha(c)) {
117 assert_in(toupper(c), "ACGTN");
118 if(begin++ >= trim5) {
119 assert_neq(0, asc2dnacat[c]);
120 r.patFw.append(asc2dna[c]);
121 }
122 charsRead++;
123 }
124 if((c = fb_.get()) < 0) {
125 return -1;
126 }
127 }
128 r.patFw.trimEnd(gTrim3);
129 return (int)r.patFw.length();
130 }
131
132 /**
133 * Parse a single quality string from fb_ and store in r.
134 * Assume that the next character obtained via fb_.get() is
135 * the first character of the quality string and the string stops
136 * at the next char upto (could be tab, newline, etc.).
137 */
parseQuals(Read & r,int charsRead,int dstLen,int trim5,char & c2,char upto='\\t',char upto2=-1)138 int QseqPatternSource::parseQuals(
139 Read& r,
140 int charsRead,
141 int dstLen,
142 int trim5,
143 char& c2,
144 char upto = '\t',
145 char upto2 = -1)
146 {
147 int qualsRead = 0;
148 int c = 0;
149 if (intQuals_) {
150 // Probably not relevant
151 char buf[4096];
152 while (qualsRead < charsRead) {
153 qualToks_.clear();
154 if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
155 for (unsigned int j = 0; j < qualToks_.size(); ++j) {
156 char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
157 assert_geq(c, 33);
158 if (qualsRead >= trim5) {
159 r.qual.append(c);
160 }
161 ++qualsRead;
162 }
163 } // done reading integer quality lines
164 if (charsRead > qualsRead) tooFewQualities(r.name);
165 } else {
166 // Non-integer qualities
167 while((qualsRead < dstLen + trim5) && c >= 0) {
168 c = fb_.get();
169 c2 = c;
170 if (c == ' ') wrongQualityFormat(r.name);
171 if(c < 0) {
172 // EOF occurred in the middle of a read - abort
173 return -1;
174 }
175 if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
176 if (qualsRead >= trim5) {
177 c = charToPhred33(c, solQuals_, phred64Quals_);
178 assert_geq(c, 33);
179 r.qual.append(c);
180 }
181 qualsRead++;
182 } else {
183 break;
184 }
185 }
186 }
187 if(r.qual.length() < (size_t)dstLen) {
188 tooFewQualities(r.name);
189 }
190 // TODO: How to detect too many qualities??
191 r.qual.resize(dstLen);
192 while(c != -1 && c != upto && (upto2 == -1 || c != upto2)) {
193 c = fb_.get();
194 c2 = c;
195 }
196 return qualsRead;
197 }
198
199 /**
200 * Read another pattern from a Qseq input file.
201 */
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)202 bool QseqPatternSource::read(
203 Read& r,
204 TReadId& rdid,
205 TReadId& endid,
206 bool& success,
207 bool& done)
208 {
209 r.reset();
210 r.color = gColor;
211 success = true;
212 done = false;
213 readCnt_++;
214 rdid = endid = readCnt_-1;
215 peekOverNewline(fb_);
216 fb_.resetLastN();
217 // 1. Machine name
218 if(parseName(r, NULL, true, true, true, false, '\t') == -1) BAIL_UNPAIRED();
219 assert_neq('\t', fb_.peek());
220 r.name.append('_');
221 // 2. Run number
222 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
223 assert_neq('\t', fb_.peek());
224 r.name.append('_');
225 // 3. Lane number
226 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
227 assert_neq('\t', fb_.peek());
228 r.name.append('_');
229 // 4. Tile number
230 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
231 assert_neq('\t', fb_.peek());
232 r.name.append('_');
233 // 5. X coordinate of spot
234 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
235 assert_neq('\t', fb_.peek());
236 r.name.append('_');
237 // 6. Y coordinate of spot
238 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
239 assert_neq('\t', fb_.peek());
240 r.name.append('_');
241 // 7. Index
242 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
243 assert_neq('\t', fb_.peek());
244 r.name.append('/');
245 // 8. Mate number
246 if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
247 // Empty sequence??
248 if(fb_.peek() == '\t') {
249 // Get tab that separates seq from qual
250 ASSERT_ONLY(int c =) fb_.get();
251 assert_eq('\t', c);
252 assert_eq('\t', fb_.peek());
253 // Get tab that separates qual from filter
254 ASSERT_ONLY(c =) fb_.get();
255 assert_eq('\t', c);
256 // Next char is first char of filter flag
257 assert_neq('\t', fb_.peek());
258 fb_.resetLastN();
259 cerr << "Warning: skipping empty QSEQ read with name '" << r.name << "'" << endl;
260 } else {
261 assert_neq('\t', fb_.peek());
262 int charsRead = 0;
263 int mytrim5 = gTrim5;
264 // 9. Sequence
265 int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
266 assert_neq('\t', fb_.peek());
267 if(dstLen < 0) BAIL_UNPAIRED();
268 char ct = 0;
269 // 10. Qualities
270 if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\t', -1) < 0) BAIL_UNPAIRED();
271 r.trimmed3 = gTrim3;
272 r.trimmed5 = mytrim5;
273 if(ct != '\t') {
274 cerr << "Error: QSEQ with name " << r.name << " did not have tab after qualities" << endl;
275 throw 1;
276 }
277 assert_eq(ct, '\t');
278 }
279 // 11. Filter flag
280 int filt = fb_.get();
281 if(filt == -1) BAIL_UNPAIRED();
282 r.filter = filt;
283 if(filt != '0' && filt != '1') {
284 // Bad value for filt
285 }
286 if(fb_.peek() != -1 && fb_.peek() != '\n') {
287 // Bad value right after the filt field
288 }
289 fb_.get();
290 r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
291 fb_.resetLastN();
292 if(r.qual.length() < r.patFw.length()) {
293 tooFewQualities(r.name);
294 } else if(r.qual.length() > r.patFw.length()) {
295 tooManyQualities(r.name);
296 }
297 #ifndef NDEBUG
298 assert_eq(r.patFw.length(), r.qual.length());
299 for(size_t i = 0; i < r.qual.length(); i++) {
300 assert_geq((int)r.qual[i], 33);
301 }
302 #endif
303 return true;
304 }
305