1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011, Julian Catchen <jcatchen@uoregon.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 #ifndef __CLEAN_H__
22 #define __CLEAN_H__
23 
24 #include <string>
25 #include <map>
26 #include <iostream>
27 #include <fstream>
28 #include <unordered_map>
29 
30 #include "input.h"
31 #include "kmers.h"
32 
33 enum fastqt   {generic_fastq, illv1_fastq, illv2_fastq};
34 
35 enum barcodet {null_null,
36                null_inline,   null_index,
37                inline_null,   index_null,
38                inline_inline, index_index,
39                inline_index,  index_inline};
40 enum seqt {single_end, paired_end};
41 
42 typedef unordered_map<string, vector<int>, std::hash<string> > AdapterHash;
43 
44 extern uint     min_bc_size_1, max_bc_size_1, min_bc_size_2, max_bc_size_2;
45 extern int      barcode_dist_1, barcode_dist_2;
46 extern barcodet barcode_type;
47 extern uint     truncate_seq;
48 extern double   win_size;
49 extern bool     paired;
50 extern bool     recover;
51 
52 class BarcodePair {
53 public:
54     string se;    // Single-end barcode.
55     string pe;    // Paired-end barcode.
56     string name;  // Filename to open for this barcode combination.
57 
BarcodePair()58     BarcodePair()
59     {
60         this->se = "";
61         this->pe = "";
62     }
BarcodePair(char * p)63     BarcodePair(char *p)
64     {
65         this->se = string(p);
66         this->pe = "";
67     }
BarcodePair(char * p,char * q,char * n)68     BarcodePair(char *p, char *q, char *n)
69     {
70         if (p != NULL)
71             this->se = string(p);
72         if (q != NULL)
73             this->pe = string(q);
74         if (n != NULL)
75             this->name = string(n);
76     }
BarcodePair(string se,string pe,string name)77     BarcodePair(string se, string pe, string name)
78     {
79         this->se = se;
80         this->pe = pe;
81         this->name = name;
82     }
BarcodePair(string se)83     BarcodePair(string se)
84     {
85         this->se = se;
86         this->pe = "";
87     }
set(char * p,char * q)88     void set(char *p, char *q)
89     {
90         this->se = string(p);
91         this->pe = string(q);
92     }
set(char * p)93     void set(char *p)
94     {
95         this->se = string(p);
96         this->pe = "";
97     }
set(string p,string q)98     void set(string p, string q)
99     {
100         this->se = p;
101         this->pe = q;
102     }
set(string p)103     void set(string p)
104     {
105         this->se = p;
106         this->pe = "";
107     }
str()108     string str()
109     {
110         if (this->pe.length() > 0)
111             return string(this->se + "-" + this->pe);
112         else
113             return this->se;
114     }
name_exists()115     bool name_exists()
116     {
117         if (this->name.length() > 0)
118             return true;
119         return false;
120     }
121     friend bool operator<(const BarcodePair &lhs, const BarcodePair &rhs)
122     {
123         if (lhs.se < rhs.se)
124             return true;
125         else if (lhs.se == rhs.se && lhs.pe < rhs.pe)
126             return true;
127         else
128             return false;
129     }
130     friend bool operator==(const BarcodePair &lhs, const BarcodePair &rhs)
131     {
132         return (lhs.se == rhs.se && lhs.pe == rhs.pe);
133     }
134     friend ofstream& operator<<(ofstream &out, const BarcodePair &bp)
135     {
136         if (bp.pe.length() > 0)
137             out << bp.se << "-" << bp.pe;
138         else
139             out << bp.se;
140         return out;
141     }
142 };
143 
144 class RawRead {
145 public:
146     fastqt fastq_type;
147     char  *inline_bc;
148     char  *index_bc;
149     char  *se_bc;
150     char  *pe_bc;
151     char  *machine;
152     int    run;
153     int    lane;
154     int    tile;
155     int    x;
156     int    y;
157     int    index;
158     int    read;
159     char  *seq;
160     char  *phred;
161     int   *int_scores;
162     bool   filter;
163     int    inline_bc_len;
164     int    retain;
165     uint   size;
166     uint   len;
167     double win_len;
168     double stop_pos;
169 
RawRead(uint buf_len,int read,int barcode_size,double win_size)170     RawRead(uint buf_len, int read, int barcode_size, double win_size) {
171         this->inline_bc     = new char[id_len  + 1];
172         this->index_bc      = new char[id_len  + 1];
173         this->machine       = new char[id_len  + 1];
174         this->seq           = new char[buf_len + 1];
175         this->phred         = new char[buf_len + 1];
176         this->int_scores    = new  int[buf_len];
177         this->size          = buf_len + 1;
178         this->read          = read;
179 
180         this->retain        = 1;
181         this->inline_bc_len = 0;
182         this->tile          = 0;
183         this->run           = 0;
184         this->lane          = 0;
185         this->x             = 0;
186         this->y             = 0;
187         this->index         = 0;
188         this->len           = 0;
189 
190         this->inline_bc[0] = '\0';
191         this->index_bc[0]  = '\0';
192         this->machine[0]   = '\0';
193         this->seq[0]       = '\0';
194         this->phred[0]     = '\0';
195 
196         this->set_len(buf_len);
197 
198         this->se_bc = NULL;
199         this->pe_bc = NULL;
200         if (this->read == 1) {
201             switch(barcode_type) {
202             case index_inline:
203                 this->se_bc = this->index_bc;
204                 this->pe_bc = this->inline_bc;
205                 break;
206             case inline_index:
207                 this->se_bc = this->inline_bc;
208                 this->pe_bc = this->index_bc;
209                 this->inline_bc_len = barcode_size;
210                 break;
211             case inline_null:
212             case inline_inline:
213                 this->se_bc = this->inline_bc;
214                 this->inline_bc_len = barcode_size;
215                 break;
216             case index_null:
217             case index_index:
218                 this->se_bc = this->index_bc;
219                 break;
220             default:
221                 break;
222             }
223         } else if (this->read == 2) {
224             switch(barcode_type) {
225             case null_inline:
226             case inline_inline:
227             case index_inline:
228                 this->pe_bc = this->inline_bc;
229                 this->inline_bc_len = barcode_size;
230                 break;
231             case index_index:
232             case inline_index:
233                 this->pe_bc = this->index_bc;
234                 break;
235             default:
236                 break;
237             }
238         }
239     }
~RawRead()240     ~RawRead() {
241         delete [] this->inline_bc;
242         delete [] this->index_bc;
243         delete [] this->machine;
244         delete [] this->seq;
245         delete [] this->phred;
246         delete [] this->int_scores;
247     }
resize(int size)248     int resize(int size) {
249         delete [] this->seq;
250         delete [] this->phred;
251         delete [] this->int_scores;
252         this->size  = size;
253         this->seq   = new char[this->size];
254         this->phred = new char[this->size];
255         this->int_scores = new int[this->size - 1];
256 
257         this->set_len(size - 1);
258 
259         return 0;
260     }
set_len(uint buf_len)261     int set_len(uint buf_len) {
262         if (buf_len == this->len)
263             return 0;
264 
265         if (buf_len > this->size - 1)
266             buf_len = this->size - 1;
267 
268         this->seq[buf_len]   = '\0';
269         this->phred[buf_len] = '\0';
270 
271         //
272         // Set the parameters for checking read quality later in processing.
273         // Window length is 15% (rounded) of the sequence length.
274         //
275         this->len      = buf_len - this->inline_bc_len;
276         this->win_len  = round((double) this->len * win_size);
277 
278         if (this->win_len < 1)
279             this->win_len = 1;
280 
281         this->len     += this->inline_bc_len;
282         this->stop_pos = this->len - this->win_len;
283 
284         return 0;
285     }
286 };
287 
288 int  parse_illumina_v1(const char *);
289 int  parse_illumina_v2(const char *);
290 int  parse_input_record(Seq *, RawRead *);
291 int  rev_complement(char *, int, bool);
292 int  reverse_qual(char *, int, bool);
293 
294 bool correct_barcode(set<string> &, RawRead *, seqt, int);
295 
296 int  filter_adapter_seq(RawRead *, char *, int, AdapterHash &, int, int, int);
297 int  init_adapter_seq(int, char *, int &, AdapterHash &);
298 
299 int  check_quality_scores(RawRead *, int, int, int, int);
300 
301 //
302 // Templated function to process barcodes.
303 //
304 template<typename fhType>
305 int
process_barcode(RawRead * href_1,RawRead * href_2,BarcodePair & bc,map<BarcodePair,fhType * > & fhs,set<string> & se_bc,set<string> & pe_bc,map<BarcodePair,map<string,long>> & barcode_log,map<string,long> & counter)306 process_barcode(RawRead *href_1, RawRead *href_2, BarcodePair &bc,
307                 map<BarcodePair, fhType *> &fhs,
308                 set<string> &se_bc, set<string> &pe_bc,
309                 map<BarcodePair, map<string, long> > &barcode_log, map<string, long> &counter)
310 {
311     if (barcode_type == null_null)
312         return 0;
313 
314     //
315     // Is this a legitimate barcode? The barcode passed into this function is the maximally long
316     // barcode. If we fail to find a match at maximum length, step down to minimum length and
317     // continue to search for a match.
318     //
319     char *p;
320     char  bc_1[id_len];
321     char  bc_2[id_len];
322     strcpy(bc_1, bc.se.c_str());
323     strcpy(bc_2, bc.pe.c_str());
324 
325     bool valid_se_bc = false;
326     bool valid_pe_bc = false;
327 
328     p = bc_1 + max_bc_size_1; // Point p at the end of string NULL.
329     for (uint i = max_bc_size_1; i >= min_bc_size_1; i--)
330         if (se_bc.count(bc_1) > 0) {
331             valid_se_bc = true;
332             break;
333         } else {
334             p--;
335             *p = '\0';
336         }
337     if (pe_bc.size() > 0) {
338         p = bc_2 + max_bc_size_2; // Point p at the end of string NULL.
339         for (uint i = max_bc_size_2; i >= min_bc_size_2; i--)
340             if (pe_bc.count(bc_2) > 0) {
341                 valid_pe_bc = true;
342                 break;
343             } else {
344                 p--;
345                 *p = '\0';
346             }
347     }
348     if (valid_se_bc == true && valid_pe_bc == true)
349         bc.set(bc_1, bc_2);
350     else if (valid_se_bc == true)
351         bc.se = bc_1;
352     else if (valid_pe_bc == true)
353         bc.pe = bc_2;
354 
355     //
356     // Log the barcodes we receive.
357     //
358     if (barcode_log.count(bc) == 0) {
359         barcode_log[bc]["noradtag"] = 0;
360         barcode_log[bc]["total"]    = 0;
361         barcode_log[bc]["low_qual"] = 0;
362         barcode_log[bc]["retained"] = 0;
363     }
364     barcode_log[bc]["total"] += paired ? 2 : 1;
365 
366     //
367     // If we have a perfectly matching barcode, set the barcode and length in the right places.
368     //
369     if (pe_bc.size() > 0 && valid_se_bc == true && valid_pe_bc == true) {
370         if (fhs.count(bc) > 0) {
371             if (paired) {
372                 strcpy(href_1->se_bc, bc_1);
373                 strcpy(href_2->pe_bc, bc_2);
374             } else {
375                 strcpy(href_1->se_bc, bc_1);
376                 strcpy(href_1->pe_bc, bc_2);
377             }
378 
379             if (barcode_type == inline_index ||
380                 barcode_type == inline_inline)
381                 href_1->inline_bc_len = strlen(bc_1);
382             if (barcode_type == index_inline ||
383                 barcode_type == inline_inline)
384                 href_2->inline_bc_len = strlen(bc_2);
385             return 0;
386         }
387 
388     } else if (valid_se_bc == true) {
389         strcpy(href_1->se_bc, bc_1);
390         if (barcode_type == inline_null ||
391             barcode_type == inline_index ||
392             barcode_type == inline_inline)
393             href_1->inline_bc_len = strlen(bc_1);
394 
395     } else if (valid_pe_bc == true) {
396         if (paired)
397             strcpy(href_2->pe_bc, bc_2);
398         else
399             strcpy(href_1->pe_bc, bc_2);
400 
401         if (barcode_type == index_inline ||
402             barcode_type == inline_inline)
403             href_2->inline_bc_len = strlen(bc_2);
404     }
405 
406     //
407     // Try to correct the barcode.
408     //
409     BarcodePair old_barcode = bc;
410     bool se_correct = false;
411     bool pe_correct = false;
412 
413     if (paired) {
414         if (se_bc.count(bc.se) == 0)
415             se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
416         if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
417             pe_correct = correct_barcode(pe_bc, href_2, paired_end, barcode_dist_2);
418 
419         if (se_correct)
420             bc.se = string(href_1->se_bc);
421         if (pe_bc.size() > 0 && pe_correct)
422             bc.pe = string(href_2->pe_bc);
423 
424         //
425         // After correcting the individual barcodes, check if the combination is valid.
426         //
427         if (fhs.count(bc) == 0) {
428             counter["ambiguous"] += 2;
429             href_1->retain = 0;
430             href_2->retain = 0;
431         }
432 
433     } else {
434         if (se_bc.count(bc.se) == 0)
435             se_correct = correct_barcode(se_bc, href_1, single_end, barcode_dist_1);
436         if (pe_bc.size() > 0 && pe_bc.count(bc.pe) == 0)
437             pe_correct = correct_barcode(pe_bc, href_1, paired_end, barcode_dist_2);
438 
439         if (se_correct)
440             bc.se = string(href_1->se_bc);
441         if (pe_bc.size() > 0 && pe_correct)
442             bc.pe = string(href_1->pe_bc);
443 
444         if (fhs.count(bc) == 0) {
445             counter["ambiguous"]++;
446             href_1->retain = 0;
447         }
448     }
449 
450     if (href_1->retain && (se_correct || pe_correct)) {
451         counter["recovered"] += paired ? 2 : 1;
452         barcode_log[old_barcode]["total"] -= paired ? 2 : 1;
453         if (barcode_log.count(bc) == 0) {
454             barcode_log[bc]["total"]    = 0;
455             barcode_log[bc]["retained"] = 0;
456             barcode_log[bc]["low_qual"] = 0;
457             barcode_log[bc]["noradtag"] = 0;
458         }
459         barcode_log[bc]["total"] += paired ? 2 : 1;
460     }
461 
462     return 0;
463 }
464 
465 #endif // __CLEAN_H__
466