1 /*
2     FASTX-toolkit - FASTA/FASTQ preprocessing tools.
3     Copyright (C) 2009-2013  A. Gordon (assafgordon@gmail.com)
4 
5     This program is free software: you can redistribute it and/or modify
6     it under the terms of the GNU Affero General Public License as
7     published by the Free Software Foundation, either version 3 of the
8     License, or (at your option) any later version.
9 
10     This program is distributed in the hope that it will be useful,
11     but WITHOUT ANY WARRANTY; without even the implied warranty of
12     MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13     GNU Affero General Public License for more details.
14 
15     You should have received a copy of the GNU Affero General Public License
16     along with this program.  If not, see <http://www.gnu.org/licenses/>.
17 */
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <algorithm>
#include <ostream>
#include <iostream>
#include <string>
#include <vector>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
28 
29 #include "sequence_alignment.h"
30 
31 #include <errno.h>
32 #include <err.h>
33 
34 #include <config.h>
35 
36 #include "fastx.h"
37 #include "fastx_args.h"
38 
39 
40 #define MAX_ADAPTER_LEN 100
41 
42 const char* usage=
43 "usage: fastx_clipper [-h] [-a ADAPTER] [-D] [-l N] [-n] [-d N] [-c] [-C] [-o] [-v] [-z] [-i INFILE] [-o OUTFILE]\n" \
44 "Part of " PACKAGE_STRING " by A. Gordon (assafgordon@gmail.com)\n" \
45 "\n" \
46 "   [-h]         = This helpful help screen.\n" \
47 "   [-a ADAPTER] = ADAPTER string. default is CCTTAAGG (dummy adapter).\n" \
48 "   [-l N]       = discard sequences shorter than N nucleotides. default is 5.\n" \
49 "   [-d N]       = Keep the adapter and N bases after it.\n" \
50 "                  (using '-d 0' is the same as not using '-d' at all. which is the default).\n" \
51 "   [-c]         = Discard non-clipped sequences (i.e. - keep only sequences which contained the adapter).\n" \
52 "   [-C]         = Discard clipped sequences (i.e. - keep only sequences which did not contained the adapter).\n" \
53 "   [-k]         = Report Adapter-Only sequences.\n" \
54 "   [-n]         = keep sequences with unknown (N) nucleotides. default is to discard such sequences.\n" \
55 "   [-v]         = Verbose - report number of sequences.\n" \
56 "                  If [-o] is specified,  report will be printed to STDOUT.\n" \
57 "                  If [-o] is not specified (and output goes to STDOUT),\n" \
58 "                  report will be printed to STDERR.\n" \
59 "   [-z]         = Compress output with GZIP.\n" \
60 "   [-D]	 = DEBUG output.\n" \
61 "   [-M N]       = require minimum adapter alignment length of N.\n" \
62 "                  If less than N nucleotides aligned with the adapter - don't clip it." \
63 "   [-i INFILE]  = FASTA/Q input file. default is STDIN.\n" \
64 "   [-o OUTFILE] = FASTA/Q output file. default is STDOUT.\n" \
65 "\n";
66 
67 //Default adapter - Dummy sequence
68 char adapter[MAX_ADAPTER_LEN]="CCTTAAGG";
69 unsigned int min_length=5;
70 int discard_unknown_bases=1;
71 int keep_delta=0;
72 int discard_non_clipped=0;
73 int discard_clipped=0;
74 int show_adapter_only=0;
75 int debug = 0 ;
76 int minimum_adapter_length = 0;
77 
78 
// Counters behind the verbose report. main() adds each record's read count
// to count_input and to at most one of the "discarded" counters.
unsigned count_input = 0;
unsigned count_discarded_too_short = 0;              // shorter than the [-l N] limit
unsigned count_discarded_adapter_at_index_zero = 0;  // adapter found at index 0 (nothing left after clipping)
unsigned count_discarded_no_adapter_found = 0;       // dropped because of [-c]
unsigned count_discarded_adapter_found = 0;          // dropped because of [-C]
unsigned count_discarded_N = 0;                      // dropped unless [-n] is given
86 
87 FASTX fastx;
88 HalfLocalSequenceAlignment align;
89 
parse_program_args(int optind,int optc,char * optarg)90 int parse_program_args(int __attribute__((unused)) optind, int optc, char* optarg)
91 {
92 	switch(optc) {
93 		case 'M':
94 			if (optarg==NULL)
95 				errx(1, "[-M] parameter requires an argument value");
96 			minimum_adapter_length = atoi(optarg);
97 			if (minimum_adapter_length<=0)
98 				errx(1,"Invalid minimum adapter length (-M %s)", optarg);
99 			break;
100 
101 		case 'k':
102 			show_adapter_only=1;
103 			break;
104 
105 		case 'D':
106 			debug++;
107 			break ;
108 
109 		case 'c':
110 			discard_non_clipped = 1;
111 			break;
112 
113 		case 'C':
114 			discard_clipped = 1 ;
115 			break ;
116 		case 'd':
117 			if (optarg==NULL)
118 				errx(1, "[-d] parameter requires an argument value");
119 			keep_delta = strtoul(optarg,NULL,10);
120 			if (keep_delta<0)
121 				errx(1,"Invalid number bases to keep (-d %s)", optarg);
122 			break;
123 		case 'a':
124 			strncpy(adapter,optarg,sizeof(adapter)-1);
125 			//TODO:
126 			//if (!valid_sequence_string(adapter))
127 			//	errx(1,"Invalid adapter string (-a %s)", adapter);
128 			break ;
129 
130 		case 'l':
131 			if (optarg==NULL)
132 				errx(1,"[-l] parameter requires an argument value");
133 
134 			min_length = strtoul(optarg, NULL, 10);
135 			break;
136 
137 		case 'n':
138 			discard_unknown_bases = 0 ;
139 			break;
140 
141 		default:
142 			errx(1,"Unknown argument (%c)", optc ) ;
143 
144 	}
145 	return 1;
146 }
147 
parse_commandline(int argc,char * argv[])148 int parse_commandline(int argc, char* argv[])
149 {
150 
151 	fastx_parse_cmdline(argc, argv, "M:kDCcd:a:s:l:n", parse_program_args);
152 
153 	if (keep_delta>0)
154 		keep_delta += strlen(adapter);
155 	return 1;
156 }
157 
158 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results ) __attribute__ ((const));
adapter_cutoff_index(const SequenceAlignmentResults & alignment_results)159 int adapter_cutoff_index ( const SequenceAlignmentResults& alignment_results )
160 {
161 	#if 0
162 	int mismatches = alignment_results.mismatches ;
163 
164 	//The adapter(=target) is expected to align from the first base.
165 	//If the start is not zero (=not aligned from first base),
166 	//count each skipped base as a mismatch
167 	mismatches += alignment_results.target_start ;
168 
169 	//The adapter is expected to align up to the end
170 	//of the adapter(=target), or the end of the query.
171 	//If it doesn't, count the un-aligned bases as mismatches
172 	int missing_from_query_end = (alignment_results.query_size - alignment_results.query_end-1);
173 	int missing_from_target_end = (alignment_results.target_size - alignment_results.target_end-1);
174 
175 	int missing_from_end = std::min(missing_from_query_end, missing_from_target_end);
176 
177 	mismatches += missing_from_end ;
178 
179 
180 
181 	std::cout << "Missing from start = " << alignment_results.target_start
182 		  << " Missing from end = " << missing_from_end
183 		  << " mismatches = " << mismatches
184 		  << std::endl;
185 
186 	if (mismatches > max_mismatches)
187 		return -1;
188 
189 	return alignment_results.query_start;
190 	#endif
191 
192 	int alignment_size = alignment_results.neutral_matches +
193 			     alignment_results.matches +
194 			     alignment_results.mismatches +
195 			     alignment_results.gaps ;
196 
197 	//No alignment at all?
198 	if (alignment_size==0)
199 		return -1;
200 
201 	if (minimum_adapter_length>0 && alignment_size<minimum_adapter_length)
202 		return -1;
203 
204 	//Any good alignment at the end of the query
205 	//(even only a single nucleotide)
206 	//Example:
207 	//  The adapter starts with CTGTAG, The Query ends with CT - it's a match.
208 	if ( alignment_results.query_end == alignment_results.query_size-1
209 	     &&
210 	     alignment_results.mismatches == 0 ) {
211 	     	//printf("--1\n");
212 		return alignment_results.query_start ;
213 	}
214 
215 	if ( alignment_size > 5
216 	     &&
217 	     alignment_results.target_start == 0
218 	     &&
219 	     (alignment_results.matches * 100 / alignment_size ) >= 75 ) {
220 	     	//printf("--2\n");
221 		return alignment_results.query_start ;
222 	}
223 
224 	if ( alignment_size > 11
225 	     &&
226 	     (alignment_results.matches * 100 / alignment_size ) >= 80 ) {
227 	     	//printf("--2\n");
228 		return alignment_results.query_start ;
229 	}
230 
231 	//
232 	//Be very lenient regarding alignments at the end of the query sequence
233 	if ( alignment_results.query_end >= alignment_results.query_size-2
234 	     &&
235 	     alignment_size <= 5 && alignment_results.matches >= 3) {
236 			//printf("--3\n");
237 			return alignment_results.query_start ;
238 		}
239 
240 	return -1;
241 }
242 
243 
main(int argc,char * argv[])244 int main(int argc, char* argv[])
245 {
246 	int i;
247 	int reads_count;
248 
249 	parse_commandline(argc, argv);
250 
251 	fastx_init_reader(&fastx, get_input_filename(),
252 		FASTA_OR_FASTQ, ALLOW_N, REQUIRE_UPPERCASE,
253 		get_fastq_ascii_quality_offset() );
254 
255 	fastx_init_writer(&fastx, get_output_filename(), OUTPUT_SAME_AS_INPUT, compress_output_flag());
256 
257 	while ( fastx_read_next_record(&fastx) ) {
258 
259 		reads_count = get_reads_count(&fastx);
260 
261 		#if 0
262 		std::string query = std::string(fastx.nucleotides) + std::string( strlen(adapter), 'N' );
263 		std::string target= std::string( strlen(fastx.nucleotides), 'N' ) + std::string(adapter);
264 		#else
265 		std::string query = std::string(fastx.nucleotides) ;
266 		std::string target= std::string(adapter);
267 		#endif
268 
269 
270 		align.align( query, target ) ;
271 
272 		if (debug>1)
273 			align.print_matrix();
274 		if (debug>0)
275 			align.results().print();
276 
277 		count_input+= reads_count;
278 
279 		//Find the best match with the adapter
280 		i = adapter_cutoff_index ( align.results() ) ;
281 
282 		if (i!=-1 && i>0) {
283 			i += keep_delta;
284 			//Just trim the string after this position
285 			fastx.nucleotides[i] = 0 ;
286 		}
287 
288 		if (i==0) { // empty sequence ? (in which the adapter was found at index 0)
289 			count_discarded_adapter_at_index_zero += reads_count;
290 
291 			if (show_adapter_only)
292 				fastx_write_record(&fastx);
293 			continue;
294 		}
295 
296 		if (strlen(fastx.nucleotides) < min_length) { // too-short sequence ?
297 			count_discarded_too_short += reads_count;
298 			continue;
299 		}
300 
301 		if ( (i==-1) && discard_non_clipped ) { // adapter not found (i.e. sequence was not clipped) ?
302 			count_discarded_no_adapter_found += reads_count;
303 			continue ;
304 		}
305 
306 		if ( (i>0) && discard_clipped ) { // adapter found, and user requested to keep only non-clipped sequences
307 			count_discarded_adapter_found += reads_count;
308 			continue;
309 		}
310 
311 		if ( (discard_unknown_bases && strchr(fastx.nucleotides,'N')!=NULL ) ) { // contains unknown bases (after clipping) ?
312 			count_discarded_N += reads_count;
313 			continue;
314 		}
315 
316 		if (!show_adapter_only)  {
317 			//none of the above condition matched, so print this sequence.
318 			fastx_write_record(&fastx);
319 		}
320 	}
321 
322 	//
323 	//Print verbose report
324 	if ( verbose_flag() ) {
325 		fprintf(get_report_file(), "Clipping Adapter: %s\n", adapter );
326 		fprintf(get_report_file(), "Min. Length: %d\n", min_length) ;
327 
328 		if (discard_clipped)
329 			fprintf(get_report_file(), "Clipped reads - discarded.\n"  ) ;
330 		if (discard_non_clipped)
331 			fprintf(get_report_file(), "Non-Clipped reads - discarded.\n"  ) ;
332 
333 
334 		fprintf(get_report_file(), "Input: %u reads.\n", count_input ) ;
335 		fprintf(get_report_file(), "Output: %u reads.\n",
336 			count_input - count_discarded_too_short - count_discarded_no_adapter_found - count_discarded_adapter_found -
337 			count_discarded_N - count_discarded_adapter_at_index_zero ) ;
338 
339 		fprintf(get_report_file(), "discarded %u too-short reads.\n", count_discarded_too_short ) ;
340 		fprintf(get_report_file(), "discarded %u adapter-only reads.\n", count_discarded_adapter_at_index_zero );
341 		if (discard_non_clipped)
342 			fprintf(get_report_file(), "discarded %u non-clipped reads.\n", count_discarded_no_adapter_found );
343 		if (discard_clipped)
344 			fprintf(get_report_file(), "discarded %u clipped reads.\n", count_discarded_adapter_found );
345 		if (discard_unknown_bases)
346 			fprintf(get_report_file(), "discarded %u N reads.\n", count_discarded_N );
347 	}
348 
349 	return 0;
350 }
351