1 /* @source trimseq application
2 **
3 ** Trim ambiguous bits off the ends of sequences
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
21 ******************************************************************************/
22 
23 #include "emboss.h"
24 #include <ctype.h>		/* for tolower, toupper */
25 
26 #define TRIMCHARSET 256		/* size of character set */
27 
28 
29 
30 
31 static ajint trimseq_trim(const AjPSeq seq, ajint sense, AjBool isnuc,
32 			  ajint window, float percent, AjBool strict,
33 			  AjBool star);
34 static void trimseq_parole(AjBool *gang, const char *good_guys);
35 
36 
37 
38 
39 /* @prog trimseq **************************************************************
40 **
41 ** Trim ambiguous bits off the ends of sequences
42 **
43 ******************************************************************************/
44 
main(int argc,char ** argv)45 int main(int argc, char **argv)
46 {
47     AjPSeqall seqall;
48     AjPSeqout seqout;
49     AjPSeq seq = NULL;
50     ajint window;
51     AjBool left;
52     AjBool right;
53     AjBool strict;
54     AjBool star;
55     float percent;
56     AjBool isnuc;
57     AjPStr str = NULL;
58     ajint start;
59     ajint end;
60 
61 
62     embInit("trimseq", argc, argv);
63 
64     seqall  = ajAcdGetSeqall("sequence");
65     seqout  = ajAcdGetSeqoutall("outseq");
66     window  = ajAcdGetInt("window");
67     percent = ajAcdGetFloat("percent");
68     left    = ajAcdGetBoolean("left");
69     right   = ajAcdGetBoolean("right");
70     strict  = ajAcdGetBoolean("strict");
71     star    = ajAcdGetBoolean("star");
72 
73     str = ajStrNew();
74 
75     while(ajSeqallNext(seqall, &seq))
76     {
77 	/* is this a protein or nucleic sequence? */
78 	isnuc = ajSeqIsNuc(seq);
79 
80 	/* find the left start */
81 	if(left)
82 	    start = trimseq_trim(seq, 1, isnuc, window, percent, strict, star)
83 		+ 1;
84 	else
85 	    start = 0;
86 
87 	/* find the right end */
88 	if(right)
89 	    end = trimseq_trim(seq, 0, isnuc, window, percent, strict, star)
90 		- 1;
91 	else
92 	    end = ajSeqGetLen(seq)-1;
93 
94 	/* get a COPY of the sequence string */
95 	ajStrAssignS(&str, ajSeqGetSeqS(seq));
96 
97 	ajStrKeepRange(&str, start, end);
98 	ajSeqAssignSeqS(seq, str);
99 	ajSeqoutWriteSeq(seqout, seq);
100     }
101 
102     ajSeqoutClose(seqout);
103 
104     ajStrDel(&str);
105     ajSeqallDel(&seqall);
106     ajSeqDel(&seq);
107     ajSeqoutDel(&seqout);
108 
109     embExit();
110 
111     return 0;
112 }
113 
114 
115 
116 
117 /* @funcstatic trimseq_trim ***************************************************
118 **
119 ** Trim sequence
120 **
121 ** @param [r] seq [const AjPSeq] sequence
122 ** @param [r] sense [ajint] 0=right trim    1=left trim
123 ** @param [r] isnuc [AjBool] whether nucleic or protein
124 ** @param [r] window [ajint] window size
125 ** @param [r] percent [float] % ambiguity in window
126 ** @param [r] strict [AjBool] trim off all IUPAC ambiguity codes, not just X, N
127 ** @param [r] star [AjBool] trim off asterisks in proteins
128 ** @return [ajint] position to trim to or -1
129 **                 or ajSeqGetLen(seq) if no bad characters were found
130 ** @@
131 ******************************************************************************/
132 
trimseq_trim(const AjPSeq seq,ajint sense,AjBool isnuc,ajint window,float percent,AjBool strict,AjBool star)133 static ajint trimseq_trim(const AjPSeq seq,
134 			  ajint sense, AjBool isnuc, ajint window,
135 			  float percent, AjBool strict, AjBool star)
136 {
137     ajint leroy_brown;		/* last bad character */
138     ajint suspect;		/* possible last bad character */
139     AjBool gang[TRIMCHARSET];	/*
140 				 * all the characters - true if a bad one to
141 				 * be removed
142 				 */
143     ajint i;
144     ajint a;		/* start position of window */
145     ajint z;		/* position to end moving window to */
146     ajint inc;		/* increment value to move window (+-1) */
147     ajint count;	/* count of bad characters in window */
148     ajint look;		/* position in wind we are looking at */
149     float pc;		/* percentage of bad characters in this window */
150     char c;
151 
152     /* set the characters to trim */
153     for(i=0; i<TRIMCHARSET; i++) 	/* set them all to be bad initially */
154 	gang[i] = ajTrue;
155 
156     if(isnuc)
157     {
158 	/* normal bases and gap characters are good guys */
159 	trimseq_parole(gang, "acgtu.-~ ");
160 	if(!strict)
161 	    /* so are ambiguity codes if we are not strict */
162 	    trimseq_parole(gang, "mrwsykvhdb");
163     }
164     else
165     {
166 	/*
167 	** protein
168 	** normal residues and gap characters are good guys
169 	*/
170 	trimseq_parole(gang, "arndcqeghilkmfpstwyv.-~ ");
171 	if(!strict)
172 	    /* so are ambiguity codes if not strict */
173 	    trimseq_parole(gang, "bz");
174 
175 	if(star)
176 	    /* so is an asterisk if needed */
177 	    trimseq_parole(gang, "*");
178     }
179 
180     /* start loop - see direction */
181     if(sense)
182     {
183 	a = 0;
184 	z = ajSeqGetLen(seq) - window;
185 	inc = 1;
186 	leroy_brown = -1;
187 	suspect = -1;
188     }
189     else
190     {
191 	a = ajSeqGetLen(seq)-1;
192 	z = window;
193 	inc = -1;
194 	leroy_brown = ajSeqGetLen(seq);
195 	suspect = ajSeqGetLen(seq);
196     }
197 
198     /*
199     **   do an initial trim of contiguous runs of bad characters from the ends
200     **   Always trim gaps from the end
201     **/
202     for(; a != z; a += inc)
203     {
204 	c = (ajSeqGetSeqC(seq))[a];
205 	if(gang[(ajint)c] || c == '.' || c == '-' || c == '~' || c == ' ')
206 	    /* trim if bad character or a gap character at the end */
207 	    leroy_brown = a;		/* want to trim down to here */
208 	else
209 	    break;
210     }
211 
212     /* do the window trim of the remainder of the sequence */
213     for(; a != z; a += inc)
214     {
215 	/* look in the window */
216 	for(count = 0, look = 0; look < window && look > -window; look += inc)
217 	{
218 	    c = (ajSeqGetSeqC(seq))[a+look];
219 	    if(gang[(ajint)c])
220 	    {
221 		/* count the bad characters */
222 		count++;
223 		/* remember the last bad character position in this window */
224 		suspect = a+look;
225 	    }
226 	}
227 
228 	/* what is the percentage of bad characters in this window */
229 	pc = (float)100.0 * (float)count/(float)window;
230 	/* Need to trim this window? */
231 	if(pc < percent)
232 	    break;
233 
234 	if(sense)
235 	{
236 	    if(suspect > leroy_brown)
237 		leroy_brown = suspect;
238 	}
239 	else
240 	    if(suspect < leroy_brown)
241 		leroy_brown = suspect;
242     }
243 
244     /*
245     **  do a final tidy up of gap characters left at the new end of the
246     **  sequence
247     **  Always trim gaps from the end
248     */
249     for(a = leroy_brown+inc; a != z; a += inc)
250     {
251 	c = (ajSeqGetSeqC(seq))[a];
252 	if(c == '.' || c == '-' || c == '~' || c == ' ')
253 	    /* trim if we have a gap character at the end */
254 	    leroy_brown = a;		/* want to trim down to here */
255 	else
256 	    break;
257     }
258 
259     return leroy_brown;
260 }
261 
262 
263 
264 
265 /* @funcstatic trimseq_parole *************************************************
266 **
267 **  sets the upper and lowercase characters in the array 'gang' to be ajFalse
268 **
269 ** @param [w] gang [AjBool*] array of flags for whether a character
270 **                           is required or not
271 ** @param [r] good_guys [const char*] string of chars that are required
272 ** @@
273 ******************************************************************************/
274 
trimseq_parole(AjBool * gang,const char * good_guys)275 static void trimseq_parole(AjBool *gang, const char *good_guys)
276 {
277     ajint i;
278 
279     for(i=0; good_guys[i]; i++)
280     {
281 	gang[tolower((ajint) good_guys[i])] = ajFalse;
282 	gang[toupper((ajint) good_guys[i])] = ajFalse;
283     }
284 
285     return;
286 }
287