1 // -*-mode:c++; c-style:k&r; c-basic-offset:4;-*-
2 //
3 // Copyright 2011-2018, Julian Catchen <jcatchen@illinois.edu>
4 //
5 // This file is part of Stacks.
6 //
7 // Stacks is free software: you can redistribute it and/or modify
8 // it under the terms of the GNU General Public License as published by
9 // the Free Software Foundation, either version 3 of the License, or
10 // (at your option) any later version.
11 //
12 // Stacks is distributed in the hope that it will be useful,
13 // but WITHOUT ANY WARRANTY; without even the implied warranty of
14 // MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 // GNU General Public License for more details.
16 //
17 // You should have received a copy of the GNU General Public License
18 // along with Stacks. If not, see <http://www.gnu.org/licenses/>.
19 //
20
21 //
22 // clean.cc -- common routines for processing and cleaning raw seqeunce data.
23 //
24 // Julian Catchen
25 // jcatchen@uoregon.edu
26 // University of Oregon
27 //
28
29 #include "clean.h"
30
parse_illumina_v1(const char * file)31 int parse_illumina_v1(const char *file) {
32 const char *p, *q;
33
34 //
35 // Parse a file name that looks like: s_7_1_0001_qseq.txt ... s_7_1_0120_qseq.txt
36 // but exclude the paired-end files: s_7_2_0001_qseq.txt ... s_7_2_0120_qseq.txt
37 //
38 if (file[0] != 's')
39 return 0;
40
41 int underscore_cnt = 0;
42 for (p = file; *p != '\0'; p++) {
43 if (*p == '_') {
44 underscore_cnt++;
45 q = p;
46 }
47 }
48
49 if (underscore_cnt != 4)
50 return 0;
51
52 // Check the file suffix.
53 if (strncmp(q, "_qseq.txt", 8) != 0)
54 return 0;
55
56 // Make sure it is not the paired-end file
57 p = file;
58 p += 3;
59 if (strncmp(p, "_1_", 3) != 0)
60 return 0;
61
62 //
63 // Return the position of the paired-end number, so the other file name can be generated.
64 //
65 return (p + 1 - file);
66 }
67
parse_illumina_v2(const char * file)68 int parse_illumina_v2(const char *file) {
69 const char *p, *q;
70
71 //
72 // Parse a file name that looks like: lane6_NoIndex_L006_R1_003.fastq
73 // but exclude the paired-end files: lane6_NoIndex_L006_R2_003.fastq
74 //
75 // Another example could be: GfddRAD1_001_ATCACG_L008_R1_001.fastq.gz
76 // and excluding the paired-end file: GfddRAD1_001_ATCACG_L008_R2_001.fastq.gz
77
78 //
79 // Make sure it ends in "fastq" or "fastq.gz"
80 //
81 for (q = file; *q != '\0'; q++);
82 for (p = q; *p != '.' && p > file; p--);
83 if (strncmp(p, ".gz", 3) == 0)
84 for (p--; *p != '.' && p > file; p--);
85 if (strncmp(p, ".fastq", 6) != 0)
86 return 0;
87
88 //
89 // Find the part of the name marking the pair, "_R1_", make sure it is not the paired-end file.
90 //
91 p = file;
92 while (*p != '\0') {
93 for (; *p != '_' && *p != '\0'; p++);
94 if (*p == '\0') return 0;
95 if (strncmp(p, "_R1_", 4) == 0) {
96 //
97 // Return the position of the paired-end number, so the other file name can be generated.
98 //
99 return (p + 2 - file);
100 }
101 p++;
102 }
103
104 return 0;
105 }
106
107 int
parse_input_record(Seq * s,RawRead * r)108 parse_input_record(Seq *s, RawRead *r)
109 {
110 char *p, *q, *z;
111 uint lim;
112 //
113 // Count the number of colons to differentiate Illumina version.
114 // CASAVA 1.8+ has a FASTQ header like this:
115 // @HWI-ST0747:155:C01WHABXX:8:1101:6455:26332 1:N:0:
116 // Or, with the embedded barcode:
117 // @HWI-ST1233:67:D12GNACXX:7:2307:14604:78978 1:N:0:ATCACG
118 //
119 //
120 // Or, parse FASTQ header from previous versions that looks like this:
121 // @HWI-ST0747_0141:4:1101:1240:2199#0/1
122 // @HWI-ST0747_0143:2:2208:21290:200914#0/1
123 //
124 char *stop = s->id + strlen(s->id);
125 int colon_cnt = 0;
126 int hash_cnt = 0;
127
128 for (p = s->id, q = p; q < stop; q++) {
129 colon_cnt += *q == ':' ? 1 : 0;
130 hash_cnt += *q == '#' ? 1 : 0;
131 }
132
133 if (colon_cnt == 9 && hash_cnt == 0) {
134 r->fastq_type = illv2_fastq;
135 //
136 // According to Illumina manual, "CASAVA v1.8 User Guide" page 41:
137 // @<instrument>:<run number>:<flowcell ID>:<lane>:<tile>:<x-pos>:<y-pos> <read>:<is filtered>:<control number>:<index sequence>
138 //
139 for (p = s->id, q = p; *q != ':' && q < stop; q++);
140 if (q < stop) {
141 *q = '\0';
142 strcpy(r->machine, p);
143 *q = ':';
144 }
145
146 // Run number.
147 for (p = q+1, q = p; *q != ':' && q < stop; q++);
148 if (q < stop) {
149 *q = '\0';
150 r->run = atoi(p);
151 *q = ':';
152 }
153
154 // Flowcell ID.
155 for (p = q+1, q = p; *q != ':' && q < stop; q++);
156 //*q = '\0';
157
158 for (p = q+1, q = p; *q != ':' && q < stop; q++);
159 if (q < stop) {
160 *q = '\0';
161 r->lane = atoi(p);
162 *q = ':';
163 }
164
165 for (p = q+1, q = p; *q != ':' && q < stop; q++);
166 if (q < stop) {
167 *q = '\0';
168 r->tile = atoi(p);
169 *q = ':';
170 }
171
172 for (p = q+1, q = p; *q != ':' && q < stop; q++);
173 if (q < stop) {
174 *q = '\0';
175 r->x = atoi(p);
176 *q = ':';
177 }
178
179 for (p = q+1, q = p; *q != ' ' && q < stop; q++);
180 if (q < stop) {
181 *q = '\0';
182 r->y = atoi(p);
183 *q = ' ';
184 }
185
186 for (p = q+1, q = p; *q != ':' && q < stop; q++);
187 if (q < stop) {
188 *q = '\0';
189 // r->read = atoi(p);
190 *q = ':';
191 }
192
193 for (p = q+1, q = p; *q != ':' && q < stop; q++);
194 if (q < stop) {
195 *q = '\0';
196 r->filter = *p == 'Y' ? true : false;
197 *q = ':';
198 }
199
200 // Control Number.
201 for (p = q+1, q = p; *q != ':' && q < stop; q++);
202 //*q = '\0';
203
204 //
205 // Index barcode
206 //
207 // The index barcode appears identically in both single-end and paired-end reads.
208 // If the barcode type is index_index, the barcode will appear as NNNNNN+NNNNNN
209 // in both reads. If the specified barcode type is null_index we want to read only
210 // the second half of the index, if the type is index_null, we want to read
211 // only the first half, or the full string if there is no '+' character.
212 //
213 if (q < stop)
214 for (p = q+1, q = p; q < stop; q++);
215 else
216 p = q;
217
218 if (*p != '\0') {
219 //
220 // Check if there is a '+' character.
221 //
222 for (z = p; *z != '+' && *z != '\0'; z++);
223
224 if (r->read == 1) {
225 lim = z - p;
226
227 switch (barcode_type) {
228 case index_null:
229 case index_index:
230 case index_inline:
231 lim = lim < max_bc_size_1 ? lim : max_bc_size_1;
232 strncpy(r->index_bc, p, lim);
233 r->index_bc[lim] = '\0';
234 break;
235 case inline_index:
236 lim = lim < max_bc_size_2 ? lim : max_bc_size_2;
237 strncpy(r->index_bc, p, lim);
238 r->index_bc[lim] = '\0';
239 break;
240 default:
241 break;
242 }
243 } else if (r->read == 2) {
244 if (*z == '+')
245 p = z + 1;
246
247 switch (barcode_type) {
248 case null_index:
249 case index_index:
250 case inline_index:
251 strncpy(r->index_bc, p, max_bc_size_2);
252 r->index_bc[max_bc_size_2] = '\0';
253 break;
254 default:
255 break;
256 }
257 }
258 }
259
260 } else if (colon_cnt == 4 && hash_cnt == 1) {
261 r->fastq_type = illv1_fastq;
262
263 for (p = s->id, q = p; *q != ':' && q < stop; q++);
264 if (q < stop) {
265 *q = '\0';
266 strcpy(r->machine, p);
267 *q = ':';
268 }
269
270 for (p = q+1, q = p; *q != ':' && q < stop; q++);
271 if (q < stop) {
272 *q = '\0';
273 r->lane = atoi(p);
274 *q = ':';
275 }
276
277 for (p = q+1, q = p; *q != ':' && q < stop; q++);
278 if (q < stop) {
279 *q = '\0';
280 r->tile = atoi(p);
281 *q = ':';
282 }
283
284 for (p = q+1, q = p; *q != ':' && q < stop; q++);
285 if (q < stop) {
286 *q = '\0';
287 r->x = atoi(p);
288 *q = ':';
289 }
290
291 for (p = q+1, q = p; *q != '#' && q < stop; q++);
292 if (q < stop) {
293 *q = '\0';
294 r->y = atoi(p);
295 *q = '#';
296 }
297
298 for (p = q+1, q = p; *q != '/' && q < stop; q++);
299 if (q < stop) {
300 *q = '\0';
301 r->index = atoi(p);
302 *q = '/';
303 }
304
305 for (p = q+1, q = p; *q != '\0' && q < stop; q++);
306 // r->read = atoi(p);
307
308 } else {
309 r->fastq_type = generic_fastq;
310
311 strncpy(r->machine, s->id, id_len);
312 r->machine[id_len] = '\0';
313 }
314
315 uint len = strlen(s->seq);
316
317 //
318 // Resize the sequence/phred buffers if necessary.
319 //
320 if (len > r->size - 1)
321 r->resize(len + 1);
322
323 strncpy(r->seq, s->seq, r->size - 1);
324 strncpy(r->phred, s->qual, r->size - 1);
325 r->seq[r->size - 1] = '\0';
326 r->phred[r->size - 1] = '\0';
327 r->len = len;
328
329 if (r->read == 1) {
330 switch (barcode_type) {
331 case inline_null:
332 case inline_inline:
333 case inline_index:
334 strncpy(r->inline_bc, r->seq, max_bc_size_1);
335 r->inline_bc[max_bc_size_1] = '\0';
336 break;
337 case index_inline:
338 strncpy(r->inline_bc, r->seq, max_bc_size_2);
339 r->inline_bc[max_bc_size_2] = '\0';
340 break;
341 default:
342 break;
343 }
344 } else if (r->read == 2) {
345 switch (barcode_type) {
346 case null_inline:
347 case inline_inline:
348 case index_inline:
349 strncpy(r->inline_bc, r->seq, max_bc_size_2);
350 r->inline_bc[max_bc_size_2] = '\0';
351 break;
352 default:
353 break;
354 }
355 }
356
357 r->retain = 1;
358
359 return 0;
360 }
361
362 int
rev_complement(char * seq,int offset,bool overhang)363 rev_complement(char *seq, int offset, bool overhang)
364 {
365 char *p, *q;
366
367 offset += overhang ? 1 : 0;
368 q = seq + offset;
369
370 int len = strlen(q);
371 int j = 0;
372 char *com = new char[len + 1];
373
374 for (p = q + len - 1; p >= q; p--) {
375 switch (*p) {
376 case 'A':
377 case 'a':
378 com[j] = 'T';
379 break;
380 case 'C':
381 case 'c':
382 com[j] = 'G';
383 break;
384 case 'G':
385 case 'g':
386 com[j] = 'C';
387 break;
388 case 'T':
389 case 't':
390 com[j] = 'A';
391 break;
392 }
393 j++;
394 }
395 com[len] = '\0';
396
397 for (j = 0; j < len; j++)
398 q[j] = com[j];
399
400 delete [] com;
401
402 return 0;
403 }
404
405 int
reverse_qual(char * qual,int offset,bool overhang)406 reverse_qual(char *qual, int offset, bool overhang)
407 {
408 char *p, *q;
409
410 offset += overhang ? 1 : 0;
411 q = qual + offset;
412
413 int len = strlen(q);
414 int j = 0;
415 char *com = new char[len + 1];
416
417 for (p = q + len - 1; p >= q; p--) {
418 com[j] = *p;
419 j++;
420 }
421 com[len] = '\0';
422
423 for (j = 0; j < len; j++)
424 q[j] = com[j];
425
426 delete [] com;
427
428 return 0;
429 }
430
431 //
432 // Functions for quality filtering based on phred scores.
433 //
434 int
check_quality_scores(RawRead * href,int qual_offset,int score_limit,int len_limit,int offset)435 check_quality_scores(RawRead *href, int qual_offset, int score_limit, int len_limit, int offset)
436 {
437 //
438 // Phred quality scores are discussed here:
439 // http://en.wikipedia.org/wiki/FASTQ_format
440 //
441 // Illumina 1.3+ encodes phred scores between ASCII values 64 (0 quality) and 104 (40 quality)
442 //
443 // @ABCDEFGHIJKLMNOPQRSTUVWXYZ[\]^_`abcdefgh
444 // | | | | |
445 // 64 74 84 94 104
446 // 0 10(90%) 20(99%) 30(99.9%) 40(99.99%)
447 //
448 //
449 // Drop sequence if the average phred quality score drops below a threshold within a sliding window.
450 //
451
452 //
453 // Convert the encoded quality scores to their integer values
454 //
455 // cerr << "Integer scores: ";
456 for (uint j = 0; j < href->len; j++) {
457 href->int_scores[j] = href->phred[j] - qual_offset;
458 // cerr << href->int_scores[j] << " ";
459 }
460 // cerr << "\n";
461
462 double mean = 0.0;
463 double working_sum = 0.0;
464 int *p, *q, j;
465
466 // cerr << "Window length: " << href->win_len << "; Stop position: " << href->stop_pos << "\n";
467 //
468 // Populate the sliding window.
469 //
470 for (j = offset; j < href->win_len + offset; j++)
471 working_sum += href->int_scores[j];
472
473 // cerr << "Populating the sliding window using position " << offset << " to " << href->win_len + offset - 1 << "; initial working sum: " << working_sum << "\n";
474
475 //
476 // Set p pointer to the first element in the window, and q to one element past the last element in the window.
477 //
478 p = href->int_scores + offset;
479 q = p + (int) href->win_len;
480 j = offset;
481
482 // cerr << "Setting pointers; P: " << (href->int_scores + offset) - href->int_scores << "; Q: " << p + (int) href->win_len - p << "; J: " << j << "\n";
483
484 do {
485 mean = working_sum / href->win_len;
486
487 // cerr << "J: " << j << "; Window contents: ";
488 // for (int *r = p; r < q; r++)
489 // cerr << *r << " ";
490 // cerr << "\n";
491 // cerr << " Mean: " << mean << "\n";
492
493 if (mean < score_limit) {
494
495 if (j < len_limit) {
496 return 0;
497 } else {
498 href->len = j + 1;
499 href->seq[j] = '\0';
500 href->phred[j] = '\0';
501 return -1;
502 }
503 }
504
505 //
506 // Advance the window:
507 // Add the score from the front edge of the window, subtract the score
508 // from the back edge of the window.
509 //
510 working_sum -= (double) *p;
511 working_sum += (double) *q;
512
513 // cerr << " Removing value of p: " << *p << " (position: " << p - (href->int_scores) << ")\n";
514 // cerr << " Adding value of q: " << *q << " (position: " << q - (href->int_scores) << ")\n";
515
516 p++;
517 q++;
518 j++;
519 } while (j <= href->stop_pos);
520
521 return 1;
522 }
523
524 bool
correct_barcode(set<string> & bcs,RawRead * href,seqt type,int num_errs)525 correct_barcode(set<string> &bcs, RawRead *href, seqt type, int num_errs)
526 {
527 if (recover == false)
528 return false;
529
530 //
531 // The barcode_dist variable specifies how far apart in sequence space barcodes are. If barcodes
532 // are off by two nucleotides in sequence space, than we can correct barcodes that have a single
533 // sequencing error.
534 //
535 // If the barcode sequence is off by no more than barcodes_dist-1 nucleotides, correct it. We will
536 // search the whole possible space of barcodes if more than one length of barcode was specified.
537 //
538 const char *p; char *q;
539 char bc[id_len];
540 int d, close;
541 string b;
542 set<string>::iterator it;
543
544 close = 0;
545
546 for (it = bcs.begin(); it != bcs.end(); it++) {
547
548 //
549 // Copy the proper subset of the barcode to match the length of the barcode in the bcs set.
550 //
551 strncpy(bc, type == single_end ? href->se_bc : href->pe_bc, it->length());
552 bc[it->length()] = '\0';
553
554 d = 0;
555 for (p = it->c_str(), q = bc; *p != '\0'; p++, q++)
556 if (*p != *q) d++;
557
558 if (d <= num_errs) {
559 close++;
560 b = *it;
561 break;
562 }
563 }
564
565 if (close == 1) {
566 //
567 // Correct the barcode.
568 //
569 if (type == single_end) {
570 strcpy(href->se_bc, b.c_str());
571 if (barcode_type == inline_null ||
572 barcode_type == inline_index ||
573 barcode_type == inline_inline)
574 href->inline_bc_len = b.length();
575 } else {
576 strcpy(href->pe_bc, b.c_str());
577 if (barcode_type == index_inline ||
578 barcode_type == inline_inline)
579 href->inline_bc_len = b.length();
580 }
581
582 return true;
583 }
584
585 return false;
586 }
587
588 //
589 // Functions for filtering adapter sequence
590 //
591 int
init_adapter_seq(int kmer_size,char * adapter,int & adp_len,AdapterHash & kmers)592 init_adapter_seq(int kmer_size, char *adapter, int &adp_len, AdapterHash &kmers)
593 {
594 string kmer;
595 adp_len = strlen(adapter);
596
597 int num_kmers = adp_len - kmer_size + 1;
598 char *p = adapter;
599 for (int i = 0; i < num_kmers; i++) {
600 kmer.assign(p, kmer_size);
601 kmers[kmer].push_back(i);
602 p++;
603 }
604
605 return 0;
606 }
607
608 int
filter_adapter_seq(RawRead * href,char * adapter,int adp_len,AdapterHash & adp_kmers,int kmer_size,int distance,int len_limit)609 filter_adapter_seq(RawRead *href, char *adapter, int adp_len, AdapterHash &adp_kmers,
610 int kmer_size, int distance, int len_limit)
611 {
612 vector<pair<int, int> > hits;
613 int num_kmers = href->len - kmer_size + 1;
614 const char *p = href->seq;
615 string kmer;
616
617 //
618 // Identify matching kmers and their locations of occurance.
619 //
620 for (int i = 0; i < num_kmers; i++) {
621 kmer.assign(p, kmer_size);
622
623 if (adp_kmers.count(kmer) > 0) {
624 for (uint j = 0; j < adp_kmers[kmer].size(); j++) {
625 // cerr << "Kmer hit " << kmer << " at query position " << i << " at hit position " << adp_kmers[kmer][j] << "\n";
626 hits.push_back(make_pair(i, adp_kmers[kmer][j]));
627 }
628 }
629 p++;
630 }
631
632 //
633 // Scan backwards from the position of the k-mer and then scan forwards
634 // counting the number of mismatches.
635 //
636 int mismatches, i, j, start_pos;
637
638 for (uint k = 0; k < hits.size(); k++) {
639 mismatches = 0;
640 i = hits[k].first; // Position in query sequence
641 j = hits[k].second; // Position in adapter hit
642
643 // cerr << "Starting comparison at i: "<< i << "; j: " << j << "\n";
644
645 while (i >= 0 && j >= 0) {
646 if (href->seq[i] != adapter[j])
647 mismatches++;
648 i--;
649 j--;
650 }
651
652 if (mismatches > distance)
653 continue;
654
655 start_pos = i + 1;
656 i = hits[k].first;
657 j = hits[k].second;
658
659 while (i < (int) href->len && j < adp_len && mismatches <= distance) {
660 if (href->seq[i] != adapter[j])
661 mismatches++;
662 i++;
663 j++;
664 }
665
666 // cerr << "Starting position: " << start_pos << "; Query end (i): " << i << "; adapter end (j): " << j
667 // << "; number of mismatches: " << mismatches << "; Seq Len: " << href->len << "; SeqSeq Len: " << strlen(href->seq) << "\n";
668
669 if (mismatches <= distance && (i == (int) href->len || j == adp_len)) {
670 // cerr << " Trimming or dropping.\n";
671 if (start_pos < len_limit) {
672 return 0;
673 } else {
674 href->len = start_pos + 1;
675 href->seq[start_pos] = '\0';
676 href->phred[start_pos] = '\0';
677 return -1;
678 }
679 }
680 }
681
682 return 1;
683 }
684