1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 #ifndef __CLEAN_H__
22 #define __CLEAN_H__
23
24 #include <string>
25 #include <map>
26 #include <iostream>
27 #include <fstream>
28 #include <unordered_map>
29
30 #include "input.h"
31 #include "kmers.h"
32
33 enum fastqt {generic_fastq, illv1_fastq, illv2_fastq};
34
35 enum barcodet {null_null,
36 null_inline, null_index,
37 inline_null, index_null,
38 inline_inline, index_index,
39 inline_index, index_inline};
40 enum seqt {single_end, paired_end};
41
42 typedef unordered_map<string, vector<int>, std::hash<string> > AdapterHash;
43
44 extern uint min_bc_size_1, max_bc_size_1, min_bc_size_2, max_bc_size_2;
45 extern int barcode_dist_1, barcode_dist_2;
46 extern barcodet barcode_type;
47 extern uint truncate_seq;
48 extern double win_size;
49 extern bool paired;
50 extern bool recover;
51
52 class BarcodePair {
53 public:
54 string se; // Single-end barcode.
55 string pe; // Paired-end barcode.
56 string name; // Filename to open for this barcode combination.
57
BarcodePair()58 BarcodePair()
59 {
60 this->se = "";
61 this->pe = "";
62 }
BarcodePair(char * p)63 BarcodePair(char *p)
64 {
65 this->se = string(p);
66 this->pe = "";
67 }
BarcodePair(char * p,char * q,char * n)68 BarcodePair(char *p, char *q, char *n)
69 {
70 if (p != NULL)
71 this->se = string(p);
72 if (q != NULL)
73 this->pe = string(q);
74 if (n != NULL)
75 this->name = string(n);
76 }
BarcodePair(string se,string pe,string name)77 BarcodePair(string se, string pe, string name)
78 {
79 this->se = se;
80 this->pe = pe;
81 this->name = name;
82 }
BarcodePair(string se)83 BarcodePair(string se)
84 {
85 this->se = se;
86 this->pe = "";
87 }
set(char * p,char * q)88 void set(char *p, char *q)
89 {
90 this->se = string(p);
91 this->pe = string(q);
92 }
set(char * p)93 void set(char *p)
94 {
95 this->se = string(p);
96 this->pe = "";
97 }
set(string p,string q)98 void set(string p, string q)
99 {
100 this->se = p;
101 this->pe = q;
102 }
set(string p)103 void set(string p)
104 {
105 this->se = p;
106 this->pe = "";
107 }
str()108 string str()
109 {
110 if (this->pe.length() > 0)
111 return string(this->se + "-" + this->pe);
112 else
113 return this->se;
114 }
name_exists()115 bool name_exists()
116 {
117 if (this->name.length() > 0)
118 return true;
119 return false;
120 }
121 friend bool operator<(const BarcodePair &lhs, const BarcodePair &rhs)
122 {
123 if (lhs.se < rhs.se)
124 return true;
125 else if (lhs.se == rhs.se && lhs.pe < rhs.pe)
126 return true;
127 else
128 return false;
129 }
130 friend bool operator==(const BarcodePair &lhs, const BarcodePair &rhs)
131 {
132 return (lhs.se == rhs.se && lhs.pe == rhs.pe);
133 }
134 friend ofstream& operator<<(ofstream &out, const BarcodePair &bp)
135 {
136 if (bp.pe.length() > 0)
137 out << bp.se << "-" << bp.pe;
138 else
139 out << bp.se;
140 return out;
141 }
142 };
143
144 class RawRead {
145 public:
146 fastqt fastq_type;
147 char *inline_bc;
148 char *index_bc;
149 char *se_bc;
150 char *pe_bc;
151 char *machine;
152 int run;
153 int lane;
154 int tile;
155 int x;
156 int y;
157 int index;
158 int read;
159 char *seq;
160 char *phred;
161 int *int_scores;
162 bool filter;
163 int inline_bc_len;
164 int retain;
165 uint size;
166 uint len;
167 double win_len;
168 double stop_pos;
169
RawRead(uint buf_len,int read,int barcode_size,double win_size)170 RawRead(uint buf_len, int read, int barcode_size, double win_size) {
171 this->inline_bc = new char[id_len + 1];
172 this->index_bc = new char[id_len + 1];
173 this->machine = new char[id_len + 1];
174 this->seq = new char[buf_len + 1];
175 this->phred = new char[buf_len + 1];
176 this->int_scores = new int[buf_len];
177 this->size = buf_len + 1;
178 this->read = read;
179
180 this->retain = 1;
181 this->inline_bc_len = 0;
182 this->tile = 0;
183 this->run = 0;
184 this->lane = 0;
185 this->x = 0;
186 this->y = 0;
187 this->index = 0;
188 this->len = 0;
189
190 this->inline_bc[0] = '\0';
191 this->index_bc[0] = '\0';
192 this->machine[0] = '\0';
193 this->seq[0] = '\0';
194 this->phred[0] = '\0';
195
196 this->set_len(buf_len);
197
198 this->se_bc = NULL;
199 this->pe_bc = NULL;
200 if (this->read == 1) {
201 switch(barcode_type) {
202 case index_inline:
203 this->se_bc = this->index_bc;
204 this->pe_bc = this->inline_bc;
205 break;
206 case inline_index:
207 this->se_bc = this->inline_bc;
208 this->pe_bc = this->index_bc;
209 this->inline_bc_len = barcode_size;
210 break;
211 case inline_null:
212 case inline_inline:
213 this->se_bc = this->inline_bc;
214 this->inline_bc_len = barcode_size;
215 break;
216 case index_null:
217 case index_index:
218 this->se_bc = this->index_bc;
219 break;
220 default:
221 break;
222 }
223 } else if (this->read == 2) {
224 switch(barcode_type) {
225 case null_inline:
226 case inline_inline:
227 case index_inline:
228 this->pe_bc = this->inline_bc;
229 this->inline_bc_len = barcode_size;
230 break;
231 case index_index:
232 case inline_index:
233 this->pe_bc = this->index_bc;
234 break;
235 default:
236 break;
237 }
238 }
239 }
~RawRead()240 ~RawRead() {
241 delete [] this->inline_bc;
242 delete [] this->index_bc;
243 delete [] this->machine;
244 delete [] this->seq;
245 delete [] this->phred;
246 delete [] this->int_scores;
247 }
resize(int size)248 int resize(int size) {
249 delete [] this->seq;
250 delete [] this->phred;
251 delete [] this->int_scores;
252 this->size = size;
253 this->seq = new char[this->size];
254 this->phred = new char[this->size];
255 this->int_scores = new int[this->size - 1];
256
257 this->set_len(size - 1);
258
259 return 0;
260 }
set_len(uint buf_len)261 int set_len(uint buf_len) {
262 if (buf_len == this->len)
263 return 0;
264
265 if (buf_len > this->size - 1)
266 buf_len = this->size - 1;
267
268 this->seq[buf_len] = '\0';
269 this->phred[buf_len] = '\0';
270
271 //
272 // Set the parameters for checking read quality later in processing.
273 // Window length is 15% (rounded) of the sequence length.
274 //
275 this->len = buf_len - this->inline_bc_len;
276 this->win_len = round((double) this->len * win_size);
277
278 if (this->win_len < 1)
279 this->win_len = 1;
280
281 this->len += this->inline_bc_len;
282 this->stop_pos = this->len - this->win_len;
283
284 return 0;
285 }
286 };
287
288 int parse_illumina_v1(const char *);
289 int parse_illumina_v2(const char *);
290 int parse_input_record(Seq *, RawRead *);
291 int rev_complement(char *, int, bool);
292 int reverse_qual(char *, int, bool);
293
294 bool correct_barcode(set<string> &, RawRead *, seqt, int);
295
296 int filter_adapter_seq(RawRead *, char *, int, AdapterHash &, int, int, int);
297 int init_adapter_seq(int, char *, int &, AdapterHash &);
298
299 int check_quality_scores(RawRead *, int, int, int, int);
300
301 //
302 // Templated function to process barcodes.
303 //
304 template<typename fhType>
305 int
process_barcode(RawRead * href_1,RawRead * href_2,BarcodePair & bc,map<BarcodePair,fhType * > & fhs,set<string> & se_bc,set<string> & pe_bc,map<BarcodePair,map<string,long>> & barcode_log,map<string,long> & counter)306 process_barcode(RawRead *href_1, RawRead *href_2, BarcodePair &bc,
307 map<BarcodePair, fhType *> &fhs,
308 set<string> &se_bc, set<string> &pe_bc,
309 map<BarcodePair, map<string, long> > &barcode_log, map<string, long> &counter)
310 {
311 if (barcode_type == null_null)
312 return 0;
313
314 //
315 // Is this a legitimate barcode? The barcode passed into this function is the maximally long
316 // barcode. If we fail to find a match at maximum length, step down to minimum length and
317 // continue to search for a match.
318 //
319 char *p;
320 char bc_1[id_len];
321 char bc_2[id_len];
322 strcpy(bc_1, bc.se.c_str());
323 strcpy(bc_2, bc.pe.c_str());
324
325 bool valid_se_bc = false;
326 bool valid_pe_bc = false;
327
328 p = bc_1 + max_bc_size_1; // Point p at the end of string NULL.
329 for (uint i = max_bc_size_1; i >= min_bc_size_1; i--)
330 if (se_bc.count(bc_1) > 0) {
331 valid_se_bc = true;
332 break;
333 } else {
334 p--;
335 *p = '\0';
336 }
337 if (pe_bc.size() > 0) {
338 p = bc_2 + max_bc_size_2; // Point p at the end of string NULL.
339 for (uint i = max_bc_size_2; i >= min_bc_size_2; i--)
340 if (pe_bc.count(bc_2) > 0) {
341 valid_pe_bc = true;
342 break;
343 } else {
344 p--;
345 *p = '\0';
346 }
347 }
348 if (valid_se_bc == true && valid_pe_bc == true)
349 bc.set(bc_1, bc_2);
350 else if (valid_se_bc == true)
351 bc.se = bc_1;
352 else if (valid_pe_bc == true)
353 bc.pe = bc_2;
354
355 //
356 // Log the barcodes we receive.
357 //
358 if (barcode_log.count(bc) == 0) {
359 barcode_log[bc]["noradtag"] = 0;
360 barcode_log[bc]["total"] = 0;
361 barcode_log[bc]["low_qual"] = 0;
362 barcode_log[bc]["retained"] = 0;
363 }
364 barcode_log[bc]["total"] += paired ? 2 : 1;
365
366 //
367 // If we have a perfectly matching barcode, set the barcode and length in the right places.
368 //
369 if (pe_bc.size() > 0 && valid_se_bc == true && valid_pe_bc == true) {
370 if (fhs.count(bc) > 0) {
371 if (paired) {
372 strcpy(href_1->se_bc, bc_1);
373 strcpy(href_2->pe_bc, bc_2);
374 } else {
375 strcpy(href_1->se_bc, bc_1);
376 strcpy(href_1->pe_bc, bc_2);
377 }
378
379 if (barcode_type == inline_index ||
380 barcode_type == inline_inline)
381 href_1->inline_bc_len = strlen(bc_1);
382 if (barcode_type == index_inline ||
383 barcode_type == inline_inline)
384 href_2->inline_bc_len = strlen(bc_2);
385 return 0;
386 }
387
388 } else if (valid_se_bc == true) {
389 strcpy(href_1->se_bc, bc_1);
390 if (barcode_type == inline_null ||
391 barcode_type == inline_index ||
392 barcode_type == inline_inline)
393 href_1->inline_bc_len = strlen(bc_1);
394
395 } else if (valid_pe_bc == true) {
396 if (paired)
397 strcpy(href_2->pe_bc, bc_2);
398 else
399 strcpy(href_1->pe_bc, bc_2);
400
401 if (barcode_type == index_inline ||
402 barcode_type == inline_inline)
403 href_2->inline_bc_len = strlen(bc_2);
404 }
405
406 //
407 // Try to correct the barcode.
408 //
409 BarcodePair old_barcode = bc;
410 bool se_correct = false;
411 bool pe_correct = false;
412
413 if (paired) {
414 if (se_bc.count(bc.se) == 0)
415 se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
416 if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
417 pe_correct = correct_barcode(pe_bc, href_2, paired_end, barcode_dist_2);
418
419 if (se_correct)
420 bc.se = string(href_1->se_bc);
421 if (pe_bc.size() > 0 && pe_correct)
422 bc.pe = string(href_2->pe_bc);
423
424 //
425 // After correcting the individual barcodes, check if the combination is valid.
426 //
427 if (fhs.count(bc) == 0) {
428 counter["ambiguous"] += 2;
429 href_1->retain = 0;
430 href_2->retain = 0;
431 }
432
433 } else {
434 if (se_bc.count(bc.se) == 0)
435 se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
436 if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
437 pe_correct = correct_barcode(pe_bc, href_1, paired_end, barcode_dist_2);
438
439 if (se_correct)
440 bc.se = string(href_1->se_bc);
441 if (pe_bc.size() > 0 && pe_correct)
442 bc.pe = string(href_1->pe_bc);
443
444 if (fhs.count(bc) == 0) {
445 counter["ambiguous"]++;
446 href_1->retain = 0;
447 }
448 }
449
450 if (href_1->retain && (se_correct || pe_correct)) {
451 counter["recovered"] += paired ? 2 : 1;
452 barcode_log[old_barcode]["total"] -= paired ? 2 : 1;
453 if (barcode_log.count(bc) == 0) {
454 barcode_log[bc]["total"] = 0;
455 barcode_log[bc]["retained"] = 0;
456 barcode_log[bc]["low_qual"] = 0;
457 barcode_log[bc]["noradtag"] = 0;
458 }
459 barcode_log[bc]["total"] += paired ? 2 : 1;
460 }
461
462 return 0;
463 }
464
465 #endif // __CLEAN_H__
466