1 /*
2 FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3 Copyright (C) 2009-2013 A. Gordon (assafgordon@gmail.com)
4
5 This program is free software: you can redistribute it and/or modify
6 it under the terms of the GNU Affero General Public License as
7 published by the Free Software Foundation, either version 3 of the
8 License, or (at your option) any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU Affero General Public License for more details.
14
15 You should have received a copy of the GNU Affero General Public License
16 along with this program. If not, see <http://www.gnu.org/licenses/>.
17 */
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <ostream>
#include <iostream>
#include <string>
#include <vector>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
28
29 #include "sequence_alignment.h"
30
31 #include <errno.h>
32 #include <err.h>
33
34 #include <config.h>
35
36 #include "fastx.h"
37 #include "fastx_args.h"
38
39
40 #define MAX_ADAPTER_LEN 100
41
42 const char* usage=
43 "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
44 "Part of " PACKAGE_STRING " by A. Gordon (assafgordon@gmail.com)\n" \
45 "\n" \
46 " [-h] = This helpful help screen.\n" \
47 " [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \
48 " [-l N] = discard sequences shorter than N nucleotides. default is 5.\n" \
49 " [-d N] = Keep the adapter and N bases after it.\n" \
50 " (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \
51 " [-c] = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \
52 " [-C] = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \
53 " [-k] = Report Adapter-Only sequences.\n" \
54 " [-n] = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \
55 " [-v] = Verbose - report number of sequences.\n" \
56 " If [-o] is specified, report will be printed to STDOUT.\n" \
57 " If [-o] is not specified (and output goes to STDOUT),\n" \
58 " report will be printed to STDERR.\n" \
59 " [-z] = Compress output with GZIP.\n" \
60 " [-D] = DEBUG output.\n" \
61 " [-M N] = require minimum adapter alignment length of N.\n" \
62 " If less than N nucleotides aligned with the adapter - don't clip it." \
63 " [-i INFILE] = FASTA/Q input file. default is STDIN.\n" \
64 " [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
65 "\n";
66
//
// Program configuration (filled in by parse_program_args / parse_commandline)
//

//Default adapter - Dummy sequence
// Overridden by [-a ADAPTER]; holds at most MAX_ADAPTER_LEN-1 characters plus the NUL.
char adapter[MAX_ADAPTER_LEN]="CCTTAAGG";
// [-l N]: sequences shorter than this (after clipping) are discarded.
unsigned int min_length=5;
// Cleared by [-n]; when non-zero, sequences containing 'N' are discarded.
int discard_unknown_bases=1;
// [-d N]: when positive, parse_commandline() adds the adapter length to it,
// and main() keeps that many characters past the adapter start position.
int keep_delta=0;
// [-c]: discard sequences in which no adapter was found.
int discard_non_clipped=0;
// [-C]: discard sequences in which the adapter was found (and clipped).
int discard_clipped=0;
// [-k]: write only the sequences where the adapter was found at index 0.
int show_adapter_only=0;
// Incremented once per [-D]; >0 prints alignment results, >1 also prints the matrix.
int debug = 0 ;
// [-M N]: alignments shorter than this are not clipped; 0 disables the check.
int minimum_adapter_length = 0;


//Statistics for verbose report
// Each counter is advanced by the record's reads count (see main()).
unsigned int count_input=0 ;
unsigned int count_discarded_too_short=0; // see [-l N] option
unsigned int count_discarded_adapter_at_index_zero=0; //empty sequences (after clipping)
unsigned int count_discarded_no_adapter_found=0; // see [-c] option
unsigned int count_discarded_adapter_found=0; // see [-C] option
unsigned int count_discarded_N=0; // see [-n]

// Reader/writer state shared by the fastx_* calls in main().
FASTX fastx;
// Aligner used to locate the adapter inside each read.
HalfLocalSequenceAlignment align;
89
parse_program_args(int optind,int optc,char * optarg)90 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
91 {
92 switch(optc) {
93 case 'M':
94 if (optarg==NULL)
95 errx(1, "[-M] parameter requires an argument value");
96 minimum_adapter_length = atoi(optarg);
97 if (minimum_adapter_length<=0)
98 errx(1,"Invalid minimum adapter length (-M %s)", optarg);
99 break;
100
101 case 'k':
102 show_adapter_only=1;
103 break;
104
105 case 'D':
106 debug++;
107 break ;
108
109 case 'c':
110 discard_non_clipped = 1;
111 break;
112
113 case 'C':
114 discard_clipped = 1 ;
115 break ;
116 case 'd':
117 if (optarg==NULL)
118 errx(1, "[-d] parameter requires an argument value");
119 keep_delta = strtoul(optarg,NULL,10);
120 if (keep_delta<0)
121 errx(1,"Invalid number bases to keep (-d %s)", optarg);
122 break;
123 case 'a':
124 strncpy(adapter,optarg,sizeof(adapter)-1);
125 //TODO:
126 //if (!valid_sequence_string(adapter))
127 // errx(1,"Invalid adapter string (-a %s)", adapter);
128 break ;
129
130 case 'l':
131 if (optarg==NULL)
132 errx(1,"[-l] parameter requires an argument value");
133
134 min_length = strtoul(optarg, NULL, 10);
135 break;
136
137 case 'n':
138 discard_unknown_bases = 0 ;
139 break;
140
141 default:
142 errx(1,"Unknown argument (%c)", optc ) ;
143
144 }
145 return 1;
146 }
147
parse_commandline(int argc,char * argv[])148 int parse_commandline(int argc, char* argv[])
149 {
150
151 fastx_parse_cmdline(argc, argv, "M:kDCcd:a:s:l:n", parse_program_args);
152
153 if (keep_delta>0)
154 keep_delta += strlen(adapter);
155 return 1;
156 }
157
158 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const));
adapter_cutoff_index(const SequenceAlignmentResults & alignment_results)159 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results )
160 {
161 #if 0
162 int mismatches = alignment_results.mismatches ;
163
164 //The adapter(=target) is expected to align from the first base.
165 //If the start is not zero (=not aligned from first base),
166 //count each skipped base as a mismatch
167 mismatches += alignment_results.target_start ;
168
169 //The adapter is expected to align up to the end
170 //of the adapter(=target), or the end of the query.
171 //If it doesn't, count the un-aligned bases as mismatches
172 int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1);
173 int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1);
174
175 int missing_from_end = std::min(missing_from_query_end, missing_from_target_end);
176
177 mismatches += missing_from_end ;
178
179
180
181 std::cout << "Missing from start = " << alignment_results.target_start
182 << " Missing from end = " << missing_from_end
183 << " mismatches = " << mismatches
184 << std::endl;
185
186 if (mismatches > max_mismatches)
187 return -1;
188
189 return alignment_results.query_start;
190 #endif
191
192 int alignment_size = alignment_results.neutral_matches +
193 alignment_results.matches +
194 alignment_results.mismatches +
195 alignment_results.gaps ;
196
197 //No alignment at all?
198 if (alignment_size==0)
199 return -1;
200
201 if (minimum_adapter_length>0 && alignment_size<minimum_adapter_length)
202 return -1;
203
204 //Any good alignment at the end of the query
205 //(even only a single nucleotide)
206 //Example:
207 // The adapter starts with CTGTAG, The Query ends with CT - it's a match.
208 if ( alignment_results.query_end == alignment_results.query_size-1
209 &&
210 alignment_results.mismatches == 0 ) {
211 //printf("--1\n");
212 return alignment_results.query_start ;
213 }
214
215 if ( alignment_size > 5
216 &&
217 alignment_results.target_start == 0
218 &&
219 (alignment_results.matches * 100 / alignment_size ) >= 75 ) {
220 //printf("--2\n");
221 return alignment_results.query_start ;
222 }
223
224 if ( alignment_size > 11
225 &&
226 (alignment_results.matches * 100 / alignment_size ) >= 80 ) {
227 //printf("--2\n");
228 return alignment_results.query_start ;
229 }
230
231 //
232 //Be very lenient regarding alignments at the end of the query sequence
233 if ( alignment_results.query_end >= alignment_results.query_size-2
234 &&
235 alignment_size <= 5 && alignment_results.matches >= 3) {
236 //printf("--3\n");
237 return alignment_results.query_start ;
238 }
239
240 return -1;
241 }
242
243
main(int argc,char * argv[])244 int main(int argc, char* argv[])
245 {
246 int i;
247 int reads_count;
248
249 parse_commandline(argc, argv);
250
251 fastx_init_reader(&fastx, get_input_filename(),
252 FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE,
253 get_fastq_ascii_quality_offset() );
254
255 fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());
256
257 while ( fastx_read_next_record(&fastx) ) {
258
259 reads_count = get_reads_count(&fastx);
260
261 #if 0
262 std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' );
263 std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter);
264 #else
265 std::string query = std::string(fastx.nucleotides) ;
266 std::string target= std::string(adapter);
267 #endif
268
269
270 align.align( query, target ) ;
271
272 if (debug>1)
273 align.print_matrix();
274 if (debug>0)
275 align.results().print();
276
277 count_input+= reads_count;
278
279 //Find the best match with the adapter
280 i = adapter_cutoff_index ( align.results() ) ;
281
282 if (i!=-1 && i>0) {
283 i += keep_delta;
284 //Just trim the string after this position
285 fastx.nucleotides[i] = 0 ;
286 }
287
288 if (i==0) { // empty sequence ? (in which the adapter was found at index 0)
289 count_discarded_adapter_at_index_zero += reads_count;
290
291 if (show_adapter_only)
292 fastx_write_record(&fastx);
293 continue;
294 }
295
296 if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ?
297 count_discarded_too_short += reads_count;
298 continue;
299 }
300
301 if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ?
302 count_discarded_no_adapter_found += reads_count;
303 continue ;
304 }
305
306 if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences
307 count_discarded_adapter_found += reads_count;
308 continue;
309 }
310
311 if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ?
312 count_discarded_N += reads_count;
313 continue;
314 }
315
316 if (!show_adapter_only) {
317 //none of the above condition matched, so print this sequence.
318 fastx_write_record(&fastx);
319 }
320 }
321
322 //
323 //Print verbose report
324 if ( verbose_flag() ) {
325 fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter );
326 fprintf(get_report_file(), "Min. Length: %d\n", min_length) ;
327
328 if (discard_clipped)
329 fprintf(get_report_file(), "Clipped reads - discarded.\n" ) ;
330 if (discard_non_clipped)
331 fprintf(get_report_file(), "Non-Clipped reads - discarded.\n" ) ;
332
333
334 fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ;
335 fprintf(get_report_file(), "Output: %u reads.\n",
336 count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found -
337 count_discarded_N - count_discarded_adapter_at_index_zero ) ;
338
339 fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ;
340 fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero );
341 if (discard_non_clipped)
342 fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found );
343 if (discard_clipped)
344 fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found );
345 if (discard_unknown_bases)
346 fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N );
347 }
348
349 return 0;
350 }
351