1 /* @source trimseq application
2 **
3 ** Trim ambiguous bits off the ends of sequences
4 **
5 ** @author Copyright (C) Gary Williams (gwilliam@hgmp.mrc.ac.uk)
6 ** @@
7 **
8 ** This program is free software; you can redistribute it and/or
9 ** modify it under the terms of the GNU General Public License
10 ** as published by the Free Software Foundation; either version 2
11 ** of the License, or (at your option) any later version.
12 **
13 ** This program is distributed in the hope that it will be useful,
14 ** but WITHOUT ANY WARRANTY; without even the implied warranty of
15 ** MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 ** GNU General Public License for more details.
17 **
18 ** You should have received a copy of the GNU General Public License
19 ** along with this program; if not, write to the Free Software
20 ** Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
21 ******************************************************************************/
22
23 #include "emboss.h"
24 #include <ctype.h> /* for tolower, toupper */
25
26 #define TRIMCHARSET 256 /* size of character set */
27
28
29
30
31 static ajint trimseq_trim(const AjPSeq seq, ajint sense, AjBool isnuc,
32 ajint window, float percent, AjBool strict,
33 AjBool star);
34 static void trimseq_parole(AjBool *gang, const char *good_guys);
35
36
37
38
39 /* @prog trimseq **************************************************************
40 **
41 ** Trim ambiguous bits off the ends of sequences
42 **
43 ******************************************************************************/
44
main(int argc,char ** argv)45 int main(int argc, char **argv)
46 {
47 AjPSeqall seqall;
48 AjPSeqout seqout;
49 AjPSeq seq = NULL;
50 ajint window;
51 AjBool left;
52 AjBool right;
53 AjBool strict;
54 AjBool star;
55 float percent;
56 AjBool isnuc;
57 AjPStr str = NULL;
58 ajint start;
59 ajint end;
60
61
62 embInit("trimseq", argc, argv);
63
64 seqall = ajAcdGetSeqall("sequence");
65 seqout = ajAcdGetSeqoutall("outseq");
66 window = ajAcdGetInt("window");
67 percent = ajAcdGetFloat("percent");
68 left = ajAcdGetBoolean("left");
69 right = ajAcdGetBoolean("right");
70 strict = ajAcdGetBoolean("strict");
71 star = ajAcdGetBoolean("star");
72
73 str = ajStrNew();
74
75 while(ajSeqallNext(seqall, &seq))
76 {
77 /* is this a protein or nucleic sequence? */
78 isnuc = ajSeqIsNuc(seq);
79
80 /* find the left start */
81 if(left)
82 start = trimseq_trim(seq, 1, isnuc, window, percent, strict, star)
83 + 1;
84 else
85 start = 0;
86
87 /* find the right end */
88 if(right)
89 end = trimseq_trim(seq, 0, isnuc, window, percent, strict, star)
90 - 1;
91 else
92 end = ajSeqGetLen(seq)-1;
93
94 /* get a COPY of the sequence string */
95 ajStrAssignS(&str, ajSeqGetSeqS(seq));
96
97 ajStrKeepRange(&str, start, end);
98 ajSeqAssignSeqS(seq, str);
99 ajSeqoutWriteSeq(seqout, seq);
100 }
101
102 ajSeqoutClose(seqout);
103
104 ajStrDel(&str);
105 ajSeqallDel(&seqall);
106 ajSeqDel(&seq);
107 ajSeqoutDel(&seqout);
108
109 embExit();
110
111 return 0;
112 }
113
114
115
116
117 /* @funcstatic trimseq_trim ***************************************************
118 **
119 ** Trim sequence
120 **
121 ** @param [r] seq [const AjPSeq] sequence
122 ** @param [r] sense [ajint] 0=right trim 1=left trim
123 ** @param [r] isnuc [AjBool] whether nucleic or protein
124 ** @param [r] window [ajint] window size
125 ** @param [r] percent [float] % ambiguity in window
126 ** @param [r] strict [AjBool] trim off all IUPAC ambiguity codes, not just X, N
127 ** @param [r] star [AjBool] trim off asterisks in proteins
128 ** @return [ajint] position to trim to or -1
129 ** or ajSeqGetLen(seq) if no bad characters were found
130 ** @@
131 ******************************************************************************/
132
trimseq_trim(const AjPSeq seq,ajint sense,AjBool isnuc,ajint window,float percent,AjBool strict,AjBool star)133 static ajint trimseq_trim(const AjPSeq seq,
134 ajint sense, AjBool isnuc, ajint window,
135 float percent, AjBool strict, AjBool star)
136 {
137 ajint leroy_brown; /* last bad character */
138 ajint suspect; /* possible last bad character */
139 AjBool gang[TRIMCHARSET]; /*
140 * all the characters - true if a bad one to
141 * be removed
142 */
143 ajint i;
144 ajint a; /* start position of window */
145 ajint z; /* position to end moving window to */
146 ajint inc; /* increment value to move window (+-1) */
147 ajint count; /* count of bad characters in window */
148 ajint look; /* position in wind we are looking at */
149 float pc; /* percentage of bad characters in this window */
150 char c;
151
152 /* set the characters to trim */
153 for(i=0; i<TRIMCHARSET; i++) /* set them all to be bad initially */
154 gang[i] = ajTrue;
155
156 if(isnuc)
157 {
158 /* normal bases and gap characters are good guys */
159 trimseq_parole(gang, "acgtu.-~ ");
160 if(!strict)
161 /* so are ambiguity codes if we are not strict */
162 trimseq_parole(gang, "mrwsykvhdb");
163 }
164 else
165 {
166 /*
167 ** protein
168 ** normal residues and gap characters are good guys
169 */
170 trimseq_parole(gang, "arndcqeghilkmfpstwyv.-~ ");
171 if(!strict)
172 /* so are ambiguity codes if not strict */
173 trimseq_parole(gang, "bz");
174
175 if(star)
176 /* so is an asterisk if needed */
177 trimseq_parole(gang, "*");
178 }
179
180 /* start loop - see direction */
181 if(sense)
182 {
183 a = 0;
184 z = ajSeqGetLen(seq) - window;
185 inc = 1;
186 leroy_brown = -1;
187 suspect = -1;
188 }
189 else
190 {
191 a = ajSeqGetLen(seq)-1;
192 z = window;
193 inc = -1;
194 leroy_brown = ajSeqGetLen(seq);
195 suspect = ajSeqGetLen(seq);
196 }
197
198 /*
199 ** do an initial trim of contiguous runs of bad characters from the ends
200 ** Always trim gaps from the end
201 **/
202 for(; a != z; a += inc)
203 {
204 c = (ajSeqGetSeqC(seq))[a];
205 if(gang[(ajint)c] || c == '.' || c == '-' || c == '~' || c == ' ')
206 /* trim if bad character or a gap character at the end */
207 leroy_brown = a; /* want to trim down to here */
208 else
209 break;
210 }
211
212 /* do the window trim of the remainder of the sequence */
213 for(; a != z; a += inc)
214 {
215 /* look in the window */
216 for(count = 0, look = 0; look < window && look > -window; look += inc)
217 {
218 c = (ajSeqGetSeqC(seq))[a+look];
219 if(gang[(ajint)c])
220 {
221 /* count the bad characters */
222 count++;
223 /* remember the last bad character position in this window */
224 suspect = a+look;
225 }
226 }
227
228 /* what is the percentage of bad characters in this window */
229 pc = (float)100.0 * (float)count/(float)window;
230 /* Need to trim this window? */
231 if(pc < percent)
232 break;
233
234 if(sense)
235 {
236 if(suspect > leroy_brown)
237 leroy_brown = suspect;
238 }
239 else
240 if(suspect < leroy_brown)
241 leroy_brown = suspect;
242 }
243
244 /*
245 ** do a final tidy up of gap characters left at the new end of the
246 ** sequence
247 ** Always trim gaps from the end
248 */
249 for(a = leroy_brown+inc; a != z; a += inc)
250 {
251 c = (ajSeqGetSeqC(seq))[a];
252 if(c == '.' || c == '-' || c == '~' || c == ' ')
253 /* trim if we have a gap character at the end */
254 leroy_brown = a; /* want to trim down to here */
255 else
256 break;
257 }
258
259 return leroy_brown;
260 }
261
262
263
264
265 /* @funcstatic trimseq_parole *************************************************
266 **
267 ** sets the upper and lowercase characters in the array 'gang' to be ajFalse
268 **
269 ** @param [w] gang [AjBool*] array of flags for whether a character
270 ** is required or not
271 ** @param [r] good_guys [const char*] string of chars that are required
272 ** @@
273 ******************************************************************************/
274
trimseq_parole(AjBool * gang,const char * good_guys)275 static void trimseq_parole(AjBool *gang, const char *good_guys)
276 {
277 ajint i;
278
279 for(i=0; good_guys[i]; i++)
280 {
281 gang[tolower((ajint) good_guys[i])] = ajFalse;
282 gang[toupper((ajint) good_guys[i])] = ajFalse;
283 }
284
285 return;
286 }
287