1 /*
2  * Copyright 2011, Ben Langmead <langmea@cs.jhu.edu>
3  *
4  * This file is part of Bowtie 2.
5  *
6  * Bowtie 2 is free software: you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation, either version 3 of the License, or
9  * (at your option) any later version.
10  *
11  * Bowtie 2 is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with Bowtie 2.  If not, see <http://www.gnu.org/licenses/>.
18  */
19 
20 #include "pat.h"
21 
22 /**
23  * Parse a name from fb_ and store in r.  Assume that the next
24  * character obtained via fb_.get() is the first character of
25  * the sequence and the string stops at the next char upto (could
26  * be tab, newline, etc.).
27  */
parseName(Read & r,Read * r2,bool append,bool clearFirst,bool warnEmpty,bool useDefault,int upto)28 int QseqPatternSource::parseName(
29 	Read& r,      // buffer for mate 1
30 	Read* r2,     // buffer for mate 2 (NULL if mate2 is read separately)
31 	bool append,     // true -> append characters, false -> skip them
32 	bool clearFirst, // clear the name buffer first
33 	bool warnEmpty,  // emit a warning if nothing was added to the name
34 	bool useDefault, // if nothing is read, put readCnt_ as a default value
35 	int upto)        // stop parsing when we first reach character 'upto'
36 {
37 	if(clearFirst) {
38 		if(r2 != NULL) r2->name.clear();
39 		r.name.clear();
40 	}
41 	while(true) {
42 		int c;
43 		if((c = fb_.get()) < 0) {
44 			// EOF reached in the middle of the name
45 			return -1;
46 		}
47 		if(c == '\n' || c == '\r') {
48 			// EOL reached in the middle of the name
49 			return -1;
50 		}
51 		if(c == upto) {
52 			// Finished with field
53 			break;
54 		}
55 		if(append) {
56 			if(r2 != NULL) r2->name.append(c);
57 			r.name.append(c);
58 		}
59 	}
60 	// Set up a default name if one hasn't been set
61 	if(r.name.empty() && useDefault && append) {
62 		char cbuf[20];
63 		itoa10(readCnt_, cbuf);
64 		r.name.append(cbuf);
65 		if(r2 != NULL) r2->name.append(cbuf);
66 	}
67 	if(r.name.empty() && warnEmpty) {
68 		cerr << "Warning: read had an empty name field" << endl;
69 	}
70 	return (int)r.name.length();
71 }
72 
73 /**
74  * Parse a single sequence from fb_ and store in r.  Assume
75  * that the next character obtained via fb_.get() is the first
76  * character of the sequence and the sequence stops at the next
77  * char upto (could be tab, newline, etc.).
78  */
parseSeq(Read & r,int & charsRead,int & trim5,char upto)79 int QseqPatternSource::parseSeq(
80 	Read& r,
81 	int& charsRead,
82 	int& trim5,
83 	char upto)
84 {
85 	int begin = 0;
86 	int c = fb_.get();
87 	assert(c != upto);
88 	r.patFw.clear();
89 	r.color = gColor;
90 	if(gColor) {
91 		// NOTE: clearly this is not relevant for Illumina output, but
92 		// I'm keeping it here in case there's some reason to put SOLiD
93 		// data in this format in the future.
94 
95 		// This may be a primer character.  If so, keep it in the
96 		// 'primer' field of the read buf and parse the rest of the
97 		// read without it.
98 		c = toupper(c);
99 		if(asc2dnacat[c] > 0) {
100 			// First char is a DNA char
101 			int c2 = toupper(fb_.peek());
102 			// Second char is a color char
103 			if(asc2colcat[c2] > 0) {
104 				r.primer = c;
105 				r.trimc = c2;
106 				trim5 += 2; // trim primer and first color
107 			}
108 		}
109 		if(c < 0) { return -1; }
110 	}
111 	while(c != upto) {
112 		if(c == '.') c = 'N';
113 		if(gColor) {
114 			if(c >= '0' && c <= '4') c = "ACGTN"[(int)c - '0'];
115 		}
116 		if(isalpha(c)) {
117 			assert_in(toupper(c), "ACGTN");
118 			if(begin++ >= trim5) {
119 				assert_neq(0, asc2dnacat[c]);
120 				r.patFw.append(asc2dna[c]);
121 			}
122 			charsRead++;
123 		}
124 		if((c = fb_.get()) < 0) {
125 			return -1;
126 		}
127 	}
128 	r.patFw.trimEnd(gTrim3);
129 	return (int)r.patFw.length();
130 }
131 
132 /**
133  * Parse a single quality string from fb_ and store in r.
134  * Assume that the next character obtained via fb_.get() is
135  * the first character of the quality string and the string stops
136  * at the next char upto (could be tab, newline, etc.).
137  */
parseQuals(Read & r,int charsRead,int dstLen,int trim5,char & c2,char upto='\\t',char upto2=-1)138 int QseqPatternSource::parseQuals(
139 	Read& r,
140 	int charsRead,
141 	int dstLen,
142 	int trim5,
143 	char& c2,
144 	char upto = '\t',
145 	char upto2 = -1)
146 {
147 	int qualsRead = 0;
148 	int c = 0;
149 	if (intQuals_) {
150 		// Probably not relevant
151 		char buf[4096];
152 		while (qualsRead < charsRead) {
153 			qualToks_.clear();
154 			if(!tokenizeQualLine(fb_, buf, 4096, qualToks_)) break;
155 			for (unsigned int j = 0; j < qualToks_.size(); ++j) {
156 				char c = intToPhred33(atoi(qualToks_[j].c_str()), solQuals_);
157 				assert_geq(c, 33);
158 				if (qualsRead >= trim5) {
159 					r.qual.append(c);
160 				}
161 				++qualsRead;
162 			}
163 		} // done reading integer quality lines
164 		if (charsRead > qualsRead) tooFewQualities(r.name);
165 	} else {
166 		// Non-integer qualities
167 		while((qualsRead < dstLen + trim5) && c >= 0) {
168 			c = fb_.get();
169 			c2 = c;
170 			if (c == ' ') wrongQualityFormat(r.name);
171 			if(c < 0) {
172 				// EOF occurred in the middle of a read - abort
173 				return -1;
174 			}
175 			if(!isspace(c) && c != upto && (upto2 == -1 || c != upto2)) {
176 				if (qualsRead >= trim5) {
177 					c = charToPhred33(c, solQuals_, phred64Quals_);
178 					assert_geq(c, 33);
179 					r.qual.append(c);
180 				}
181 				qualsRead++;
182 			} else {
183 				break;
184 			}
185 		}
186 	}
187 	if(r.qual.length() < (size_t)dstLen) {
188 		tooFewQualities(r.name);
189 	}
190 	// TODO: How to detect too many qualities??
191 	r.qual.resize(dstLen);
192 	while(c != -1 && c != upto && (upto2 == -1 || c != upto2)) {
193 		c = fb_.get();
194 		c2 = c;
195 	}
196 	return qualsRead;
197 }
198 
199 /**
200  * Read another pattern from a Qseq input file.
201  */
read(Read & r,TReadId & rdid,TReadId & endid,bool & success,bool & done)202 bool QseqPatternSource::read(
203 	Read& r,
204 	TReadId& rdid,
205 	TReadId& endid,
206 	bool& success,
207 	bool& done)
208 {
209 	r.reset();
210 	r.color = gColor;
211 	success = true;
212 	done = false;
213 	readCnt_++;
214 	rdid = endid = readCnt_-1;
215 	peekOverNewline(fb_);
216 	fb_.resetLastN();
217 	// 1. Machine name
218 	if(parseName(r, NULL, true, true,  true, false, '\t') == -1) BAIL_UNPAIRED();
219 	assert_neq('\t', fb_.peek());
220 	r.name.append('_');
221 	// 2. Run number
222 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
223 	assert_neq('\t', fb_.peek());
224 	r.name.append('_');
225 	// 3. Lane number
226 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
227 	assert_neq('\t', fb_.peek());
228 	r.name.append('_');
229 	// 4. Tile number
230 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
231 	assert_neq('\t', fb_.peek());
232 	r.name.append('_');
233 	// 5. X coordinate of spot
234 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
235 	assert_neq('\t', fb_.peek());
236 	r.name.append('_');
237 	// 6. Y coordinate of spot
238 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
239 	assert_neq('\t', fb_.peek());
240 	r.name.append('_');
241 	// 7. Index
242 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
243 	assert_neq('\t', fb_.peek());
244 	r.name.append('/');
245 	// 8. Mate number
246 	if(parseName(r, NULL, true, false, true, false, '\t') == -1) BAIL_UNPAIRED();
247 	// Empty sequence??
248 	if(fb_.peek() == '\t') {
249 		// Get tab that separates seq from qual
250 		ASSERT_ONLY(int c =) fb_.get();
251 		assert_eq('\t', c);
252 		assert_eq('\t', fb_.peek());
253 		// Get tab that separates qual from filter
254 		ASSERT_ONLY(c =) fb_.get();
255 		assert_eq('\t', c);
256 		// Next char is first char of filter flag
257 		assert_neq('\t', fb_.peek());
258 		fb_.resetLastN();
259 		cerr << "Warning: skipping empty QSEQ read with name '" << r.name << "'" << endl;
260 	} else {
261 		assert_neq('\t', fb_.peek());
262 		int charsRead = 0;
263 		int mytrim5 = gTrim5;
264 		// 9. Sequence
265 		int dstLen = parseSeq(r, charsRead, mytrim5, '\t');
266 		assert_neq('\t', fb_.peek());
267 		if(dstLen < 0) BAIL_UNPAIRED();
268 		char ct = 0;
269 		// 10. Qualities
270 		if(parseQuals(r, charsRead, dstLen, mytrim5, ct, '\t', -1) < 0) BAIL_UNPAIRED();
271 		r.trimmed3 = gTrim3;
272 		r.trimmed5 = mytrim5;
273 		if(ct != '\t') {
274 			cerr << "Error: QSEQ with name " << r.name << " did not have tab after qualities" << endl;
275 			throw 1;
276 		}
277 		assert_eq(ct, '\t');
278 	}
279 	// 11. Filter flag
280 	int filt = fb_.get();
281 	if(filt == -1) BAIL_UNPAIRED();
282 	r.filter = filt;
283 	if(filt != '0' && filt != '1') {
284 		// Bad value for filt
285 	}
286 	if(fb_.peek() != -1 && fb_.peek() != '\n') {
287 		// Bad value right after the filt field
288 	}
289 	fb_.get();
290 	r.readOrigBuf.install(fb_.lastN(), fb_.lastNLen());
291 	fb_.resetLastN();
292 	if(r.qual.length() < r.patFw.length()) {
293 		tooFewQualities(r.name);
294 	} else if(r.qual.length() > r.patFw.length()) {
295 		tooManyQualities(r.name);
296 	}
297 #ifndef NDEBUG
298 	assert_eq(r.patFw.length(), r.qual.length());
299 	for(size_t i = 0; i < r.qual.length(); i++) {
300 		assert_geq((int)r.qual[i], 33);
301 	}
302 #endif
303 	return true;
304 }
305