1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks.  If not, see <http://www.gnu.org/licenses/>.
19 //
20 
21 //
22 // clean.cc -- common routines for processing and cleaning raw seqeunce data.
23 //
24 // Julian Catchen
25 // jcatchen@uoregon.edu
26 // University of Oregon
27 //
28 
29 #include "clean.h"
30 
parse_illumina_v1(const char * file)31 int parse_illumina_v1(const char *file) {
32     const char *p, *q;
33 
34     //
35     // Parse a file name that looks like: s_7_1_0001_qseq.txt ... s_7_1_0120_qseq.txt
36     // but exclude the paired-end files:  s_7_2_0001_qseq.txt ... s_7_2_0120_qseq.txt
37     //
38     if (file[0] != 's')
39         return 0;
40 
41     int underscore_cnt = 0;
42     for (p = file; *p != '\0'; p++) {
43         if (*p == '_') {
44             underscore_cnt++;
45             q = p;
46         }
47     }
48 
49     if (underscore_cnt != 4)
50         return 0;
51 
52     // Check the file suffix.
53     if (strncmp(q, "_qseq.txt", 8) != 0)
54         return 0;
55 
56     // Make sure it is not the paired-end file
57     p  = file;
58     p += 3;
59     if (strncmp(p, "_1_", 3) != 0)
60         return 0;
61 
62     //
63     // Return the position of the paired-end number, so the other file name can be generated.
64     //
65     return (p + 1 - file);
66 }
67 
parse_illumina_v2(const char * file)68 int parse_illumina_v2(const char *file) {
69     const char *p, *q;
70 
71     //
72     // Parse a file name that looks like: lane6_NoIndex_L006_R1_003.fastq
73     // but exclude the paired-end files:  lane6_NoIndex_L006_R2_003.fastq
74     //
75     // Another example could be:          GfddRAD1_001_ATCACG_L008_R1_001.fastq.gz
76     // and excluding the paired-end file: GfddRAD1_001_ATCACG_L008_R2_001.fastq.gz
77 
78     //
79     // Make sure it ends in "fastq" or "fastq.gz"
80     //
81     for (q = file; *q != '\0'; q++);
82     for (p = q; *p != '.' && p > file; p--);
83     if (strncmp(p, ".gz", 3) == 0)
84         for (p--; *p != '.' && p > file; p--);
85     if (strncmp(p, ".fastq", 6) != 0)
86         return 0;
87 
88     //
89     // Find the part of the name marking the pair, "_R1_", make sure it is not the paired-end file.
90     //
91     p = file;
92     while (*p != '\0') {
93         for (; *p != '_' && *p != '\0'; p++);
94         if (*p == '\0') return 0;
95         if (strncmp(p, "_R1_", 4) == 0) {
96             //
97             // Return the position of the paired-end number, so the other file name can be generated.
98             //
99             return (p + 2 - file);
100         }
101         p++;
102     }
103 
104     return 0;
105 }
106 
107 int
parse_input_record(Seq * s,RawRead * r)108 parse_input_record(Seq *s, RawRead *r)
109 {
110     char *p, *q, *z;
111     uint  lim;
112     //
113     // Count the number of colons to differentiate Illumina version.
114     // CASAVA 1.8+ has a FASTQ header like this:
115     //  @HWI-ST0747:155:C01WHABXX:8:1101:6455:26332 1:N:0:
116     // Or, with the embedded barcode:
117     //  @HWI-ST1233:67:D12GNACXX:7:2307:14604:78978 1:N:0:ATCACG
118     //
119     //
120     // Or, parse FASTQ header from previous versions that looks like this:
121     //  @HWI-ST0747_0141:4:1101:1240:2199#0/1
122     //  @HWI-ST0747_0143:2:2208:21290:200914#0/1
123     //
124     char *stop = s->id + strlen(s->id);
125     int   colon_cnt = 0;
126     int   hash_cnt  = 0;
127 
128     for (p = s->id, q = p; q < stop; q++) {
129         colon_cnt += *q == ':' ? 1 : 0;
130         hash_cnt  += *q == '#' ? 1 : 0;
131     }
132 
133     if (colon_cnt == 9 && hash_cnt == 0) {
134         r->fastq_type = illv2_fastq;
135         //
136         // According to Illumina manual, "CASAVA v1.8 User Guide" page 41:
137         // @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
138         //
139         for (p = s->id, q = p; *q != ':' && q < stop; q++);
140         if (q < stop) {
141             *q = '\0';
142             strcpy(r->machine, p);
143             *q = ':';
144         }
145 
146         // Run number.
147         for (p = q+1, q = p; *q != ':' && q < stop; q++);
148         if (q < stop) {
149             *q = '\0';
150             r->run = atoi(p);
151             *q = ':';
152         }
153 
154         // Flowcell ID.
155         for (p = q+1, q = p; *q != ':' && q < stop; q++);
156         //*q = '\0';
157 
158         for (p = q+1, q = p; *q != ':' && q < stop; q++);
159         if (q < stop) {
160             *q = '\0';
161             r->lane = atoi(p);
162             *q = ':';
163         }
164 
165         for (p = q+1, q = p; *q != ':' && q < stop; q++);
166         if (q < stop) {
167             *q = '\0';
168             r->tile = atoi(p);
169             *q = ':';
170         }
171 
172         for (p = q+1, q = p; *q != ':' && q < stop; q++);
173         if (q < stop) {
174             *q = '\0';
175             r->x = atoi(p);
176             *q = ':';
177         }
178 
179         for (p = q+1, q = p; *q != ' ' && q < stop; q++);
180         if (q < stop) {
181             *q = '\0';
182             r->y = atoi(p);
183             *q = ' ';
184         }
185 
186         for (p = q+1, q = p; *q != ':' && q < stop; q++);
187         if (q < stop) {
188             *q = '\0';
189             // r->read = atoi(p);
190             *q = ':';
191         }
192 
193         for (p = q+1, q = p; *q != ':' && q < stop; q++);
194         if (q < stop) {
195             *q = '\0';
196             r->filter = *p == 'Y' ? true : false;
197             *q = ':';
198         }
199 
200         // Control Number.
201         for (p = q+1, q = p; *q != ':' && q < stop; q++);
202         //*q = '\0';
203 
204         //
205         // Index barcode
206         //
207         // The index barcode appears identically in both single-end and paired-end reads.
208         // If the barcode type is index_index, the barcode will appear as NNNNNN+NNNNNN
209         // in both reads. If the specified barcode type is null_index we want to read only
210         // the second half of the index, if the type is index_null, we want to read
211         // only the first half, or the full string if there is no '+' character.
212         //
213         if (q < stop)
214             for (p = q+1, q = p; q < stop; q++);
215         else
216             p = q;
217 
218         if (*p != '\0') {
219             //
220             // Check if there is a '+' character.
221             //
222             for (z = p; *z != '+' && *z != '\0'; z++);
223 
224             if (r->read == 1) {
225                 lim = z - p;
226 
227                 switch (barcode_type) {
228                 case index_null:
229                 case index_index:
230                 case index_inline:
231                     lim = lim < max_bc_size_1 ? lim : max_bc_size_1;
232                     strncpy(r->index_bc, p, lim);
233                     r->index_bc[lim] = '\0';
234                     break;
235                 case inline_index:
236                     lim = lim < max_bc_size_2 ? lim : max_bc_size_2;
237                     strncpy(r->index_bc, p, lim);
238                     r->index_bc[lim] = '\0';
239                     break;
240                 default:
241                     break;
242                 }
243             } else if (r->read == 2) {
244                 if (*z == '+')
245                     p = z + 1;
246 
247                 switch (barcode_type) {
248                 case null_index:
249                 case index_index:
250                 case inline_index:
251                     strncpy(r->index_bc, p,  max_bc_size_2);
252                     r->index_bc[max_bc_size_2] = '\0';
253                     break;
254                 default:
255                     break;
256                 }
257             }
258         }
259 
260     } else if (colon_cnt == 4 && hash_cnt == 1) {
261         r->fastq_type = illv1_fastq;
262 
263         for (p = s->id, q = p; *q != ':' && q < stop; q++);
264         if (q < stop) {
265             *q = '\0';
266             strcpy(r->machine, p);
267             *q = ':';
268         }
269 
270         for (p = q+1, q = p; *q != ':' && q < stop; q++);
271         if (q < stop) {
272             *q = '\0';
273             r->lane = atoi(p);
274             *q = ':';
275         }
276 
277         for (p = q+1, q = p; *q != ':' && q < stop; q++);
278         if (q < stop) {
279             *q = '\0';
280             r->tile = atoi(p);
281             *q = ':';
282         }
283 
284         for (p = q+1, q = p; *q != ':' && q < stop; q++);
285         if (q < stop) {
286             *q = '\0';
287             r->x = atoi(p);
288             *q = ':';
289         }
290 
291         for (p = q+1, q = p; *q != '#' && q < stop; q++);
292         if (q < stop) {
293             *q = '\0';
294             r->y = atoi(p);
295             *q = '#';
296         }
297 
298         for (p = q+1, q = p; *q != '/' && q < stop; q++);
299         if (q < stop) {
300             *q = '\0';
301             r->index = atoi(p);
302             *q = '/';
303         }
304 
305         for (p = q+1, q = p; *q != '\0' && q < stop; q++);
306         // r->read = atoi(p);
307 
308     } else {
309         r->fastq_type = generic_fastq;
310 
311         strncpy(r->machine, s->id, id_len);
312         r->machine[id_len] = '\0';
313     }
314 
315     uint len = strlen(s->seq);
316 
317     //
318     // Resize the sequence/phred buffers if necessary.
319     //
320     if (len > r->size - 1)
321         r->resize(len + 1);
322 
323     strncpy(r->seq,   s->seq,  r->size - 1);
324     strncpy(r->phred, s->qual, r->size - 1);
325     r->seq[r->size - 1]   = '\0';
326     r->phred[r->size - 1] = '\0';
327     r->len = len;
328 
329     if (r->read == 1) {
330         switch (barcode_type) {
331         case inline_null:
332         case inline_inline:
333         case inline_index:
334             strncpy(r->inline_bc, r->seq, max_bc_size_1);
335             r->inline_bc[max_bc_size_1] = '\0';
336             break;
337         case index_inline:
338             strncpy(r->inline_bc, r->seq, max_bc_size_2);
339             r->inline_bc[max_bc_size_2] = '\0';
340             break;
341         default:
342             break;
343         }
344     } else if (r->read == 2) {
345         switch (barcode_type) {
346         case null_inline:
347         case inline_inline:
348         case index_inline:
349             strncpy(r->inline_bc, r->seq, max_bc_size_2);
350             r->inline_bc[max_bc_size_2] = '\0';
351             break;
352         default:
353             break;
354         }
355     }
356 
357     r->retain = 1;
358 
359     return 0;
360 }
361 
362 int
rev_complement(char * seq,int offset,bool overhang)363 rev_complement(char *seq, int offset, bool overhang)
364 {
365     char *p, *q;
366 
367     offset += overhang ? 1 : 0;
368     q       = seq + offset;
369 
370     int len   = strlen(q);
371     int j     = 0;
372     char *com = new char[len + 1];
373 
374     for (p = q + len - 1; p >= q; p--) {
375         switch (*p) {
376         case 'A':
377         case 'a':
378             com[j] = 'T';
379             break;
380         case 'C':
381         case 'c':
382             com[j] = 'G';
383             break;
384         case 'G':
385         case 'g':
386             com[j] = 'C';
387             break;
388         case 'T':
389         case 't':
390             com[j] = 'A';
391             break;
392         }
393         j++;
394     }
395     com[len] = '\0';
396 
397     for (j = 0; j < len; j++)
398         q[j] = com[j];
399 
400     delete [] com;
401 
402     return 0;
403 }
404 
405 int
reverse_qual(char * qual,int offset,bool overhang)406 reverse_qual(char *qual, int offset, bool overhang)
407 {
408     char *p, *q;
409 
410     offset += overhang ? 1 : 0;
411     q       = qual + offset;
412 
413     int len   = strlen(q);
414     int j     = 0;
415     char *com = new char[len + 1];
416 
417     for (p = q + len - 1; p >= q; p--) {
418         com[j] = *p;
419         j++;
420     }
421     com[len] = '\0';
422 
423     for (j = 0; j < len; j++)
424         q[j] = com[j];
425 
426     delete [] com;
427 
428     return 0;
429 }
430 
431 //
432 // Functions for quality filtering based on phred scores.
433 //
434 int
check_quality_scores(RawRead * href,int qual_offset,int score_limit,int len_limit,int offset)435 check_quality_scores(RawRead *href, int qual_offset, int score_limit, int len_limit, int offset)
436 {
437     //
438     // Phred quality scores are discussed here:
439     //  http://en.wikipedia.org/wiki/FASTQ_format
440     //
441     // Illumina 1.3+ encodes phred scores between ASCII values 64 (0 quality) and 104 (40 quality)
442     //
443     //   @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
444     //   |         |         |         |         |
445     //  64        74        84        94       104
446     //   0        10(90%)   20(99%)   30(99.9%) 40(99.99%)
447     //
448     //
449     // Drop sequence if the average phred quality score drops below a threshold within a sliding window.
450     //
451 
452     //
453     // Convert the encoded quality scores to their integer values
454     //
455     // cerr << "Integer scores: ";
456     for (uint j = 0; j < href->len; j++) {
457         href->int_scores[j] = href->phred[j] - qual_offset;
458         // cerr << href->int_scores[j] << " ";
459     }
460     // cerr << "\n";
461 
462     double mean        = 0.0;
463     double working_sum = 0.0;
464     int *p, *q, j;
465 
466     // cerr << "Window length: " << href->win_len << "; Stop position: " << href->stop_pos << "\n";
467     //
468     // Populate the sliding window.
469     //
470     for (j = offset; j < href->win_len + offset; j++)
471         working_sum += href->int_scores[j];
472 
473     // cerr << "Populating the sliding window using position " << offset << " to " << href->win_len + offset - 1 << "; initial working sum: " << working_sum << "\n";
474 
475     //
476     // Set p pointer to the first element in the window, and q to one element past the last element in the window.
477     //
478     p = href->int_scores + offset;
479     q = p + (int) href->win_len;
480     j = offset;
481 
482     // cerr << "Setting pointers; P: " << (href->int_scores + offset) - href->int_scores << "; Q: " << p + (int) href->win_len - p << "; J: " << j << "\n";
483 
484     do {
485         mean = working_sum / href->win_len;
486 
487         // cerr << "J: " << j << "; Window contents: ";
488         // for (int *r = p; r < q; r++)
489         //     cerr << *r << " ";
490         // cerr << "\n";
491         // cerr << "    Mean: " << mean << "\n";
492 
493         if (mean < score_limit) {
494 
495             if (j < len_limit) {
496                 return 0;
497             } else {
498                 href->len      = j + 1;
499                 href->seq[j]   = '\0';
500                 href->phred[j] = '\0';
501                 return -1;
502             }
503         }
504 
505         //
506         // Advance the window:
507         //   Add the score from the front edge of the window, subtract the score
508         //   from the back edge of the window.
509         //
510         working_sum -= (double) *p;
511         working_sum += (double) *q;
512 
513         // cerr << "  Removing value of p: " << *p << " (position: " << p - (href->int_scores) << ")\n";
514         // cerr << "  Adding value of q: " << *q << " (position: " << q - (href->int_scores) << ")\n";
515 
516         p++;
517         q++;
518         j++;
519     } while (j <= href->stop_pos);
520 
521     return 1;
522 }
523 
524 bool
correct_barcode(set<string> & bcs,RawRead * href,seqt type,int num_errs)525 correct_barcode(set<string> &bcs, RawRead *href, seqt type, int num_errs)
526 {
527     if (recover == false)
528         return false;
529 
530     //
531     // The barcode_dist variable specifies how far apart in sequence space barcodes are. If barcodes
532     // are off by two nucleotides in sequence space, than we can correct barcodes that have a single
533     // sequencing error.
534     //
535     // If the barcode sequence is off by no more than barcodes_dist-1 nucleotides, correct it. We will
536     // search the whole possible space of barcodes if more than one length of barcode was specified.
537     //
538     const char *p; char *q;
539     char   bc[id_len];
540     int    d, close;
541     string b;
542     set<string>::iterator it;
543 
544     close = 0;
545 
546     for (it = bcs.begin(); it != bcs.end(); it++) {
547 
548         //
549         // Copy the proper subset of the barcode to match the length of the barcode in the bcs set.
550         //
551         strncpy(bc, type == single_end ? href->se_bc : href->pe_bc, it->length());
552         bc[it->length()] = '\0';
553 
554         d = 0;
555         for (p = it->c_str(), q = bc; *p != '\0'; p++, q++)
556             if (*p != *q) d++;
557 
558         if (d <= num_errs) {
559             close++;
560             b = *it;
561             break;
562         }
563     }
564 
565     if (close == 1) {
566         //
567         // Correct the barcode.
568         //
569         if (type == single_end) {
570             strcpy(href->se_bc, b.c_str());
571             if (barcode_type == inline_null ||
572                 barcode_type == inline_index ||
573                 barcode_type == inline_inline)
574                 href->inline_bc_len = b.length();
575         } else {
576             strcpy(href->pe_bc, b.c_str());
577             if (barcode_type == index_inline ||
578                 barcode_type == inline_inline)
579                 href->inline_bc_len = b.length();
580         }
581 
582         return true;
583     }
584 
585     return false;
586 }
587 
588 //
589 // Functions for filtering adapter sequence
590 //
591 int
init_adapter_seq(int kmer_size,char * adapter,int & adp_len,AdapterHash & kmers)592 init_adapter_seq(int kmer_size, char *adapter, int &adp_len, AdapterHash &kmers)
593 {
594     string kmer;
595     adp_len = strlen(adapter);
596 
597     int   num_kmers = adp_len - kmer_size + 1;
598     char *p = adapter;
599     for (int i = 0; i < num_kmers; i++) {
600         kmer.assign(p, kmer_size);
601         kmers[kmer].push_back(i);
602         p++;
603     }
604 
605     return 0;
606 }
607 
608 int
filter_adapter_seq(RawRead * href,char * adapter,int adp_len,AdapterHash & adp_kmers,int kmer_size,int distance,int len_limit)609 filter_adapter_seq(RawRead *href, char *adapter, int adp_len, AdapterHash &adp_kmers,
610                    int kmer_size, int distance, int len_limit)
611 {
612     vector<pair<int, int> > hits;
613     int   num_kmers = href->len - kmer_size + 1;
614     const char *p   = href->seq;
615     string kmer;
616 
617     //
618     // Identify matching kmers and their locations of occurance.
619     //
620     for (int i = 0; i < num_kmers; i++) {
621         kmer.assign(p, kmer_size);
622 
623         if (adp_kmers.count(kmer) > 0) {
624             for (uint j = 0; j < adp_kmers[kmer].size(); j++) {
625                 // cerr << "Kmer hit " << kmer << " at query position " << i << " at hit position " << adp_kmers[kmer][j] << "\n";
626                 hits.push_back(make_pair(i, adp_kmers[kmer][j]));
627             }
628         }
629         p++;
630     }
631 
632     //
633     // Scan backwards from the position of the k-mer and then scan forwards
634     // counting the number of mismatches.
635     //
636     int mismatches, i, j, start_pos;
637 
638     for (uint k = 0; k < hits.size(); k++) {
639         mismatches = 0;
640         i = hits[k].first;  // Position in query sequence
641         j = hits[k].second; // Position in adapter hit
642 
643         // cerr << "Starting comparison at i: "<< i << "; j: " << j << "\n";
644 
645         while (i >= 0 && j >= 0) {
646             if (href->seq[i] != adapter[j])
647                 mismatches++;
648             i--;
649             j--;
650         }
651 
652         if (mismatches > distance)
653             continue;
654 
655         start_pos = i + 1;
656         i = hits[k].first;
657         j = hits[k].second;
658 
659         while (i < (int) href->len && j < adp_len && mismatches <= distance) {
660             if (href->seq[i] != adapter[j])
661                 mismatches++;
662             i++;
663             j++;
664         }
665 
666         // cerr << "Starting position: " << start_pos << "; Query end (i): " << i << "; adapter end (j): " << j
667         //      << "; number of mismatches: " << mismatches << "; Seq Len: " << href->len << "; SeqSeq Len: " << strlen(href->seq) << "\n";
668 
669         if (mismatches <= distance && (i == (int) href->len || j == adp_len)) {
670             // cerr << "  Trimming or dropping.\n";
671             if (start_pos < len_limit) {
672                 return 0;
673             } else {
674                 href->len = start_pos + 1;
675                 href->seq[start_pos]   = '\0';
676                 href->phred[start_pos] = '\0';
677                 return -1;
678             }
679         }
680     }
681 
682     return 1;
683 }
684