1 /* GCSDWindowAlgorithm.java
2  *
3  * created: Mon Oct 25 1999
4  *
5  * This file is part of Artemis
6  *
7  * Copyright (C) 1999  Genome Research Limited
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License
11  * as published by the Free Software Foundation; either version 2
12  * of the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
22  *
23  * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/plot/GCSDWindowAlgorithm.java,v 1.4 2009-03-17 17:47:42 tjc Exp $
24  */
25 
26 package uk.ac.sanger.artemis.plot;
27 
28 import uk.ac.sanger.artemis.util.*;
29 import uk.ac.sanger.artemis.io.Range;
30 
31 import uk.ac.sanger.artemis.sequence.*;
32 
33 /**
34  *  Objects of this class have one useful method - getValues (), which takes a
35  *  range of bases and returns a single floating point number, which is the
36  *  percent GC content of the range if the GC content of the range is more
37  *  than 2.5 standard deviations from the average GC content of the sequence.
38  *  It returns the average GC content of the sequence otherwise.  The Strand
39  *  to use is set in the constructor.
40  *
41  *  @author Kim Rutherford
42  *  @version $Id: GCSDWindowAlgorithm.java,v 1.4 2009-03-17 17:47:42 tjc Exp $
43  **/
44 
45 public class GCSDWindowAlgorithm extends BaseAlgorithm {
46   /**
47    *  Create a new GCSDWindowAlgorithm object.
48    *  @param strand The strand to do the calculation on.
49    **/
GCSDWindowAlgorithm(final Strand strand)50   public GCSDWindowAlgorithm (final Strand strand) {
51     super (strand, "GC Content (%) With A 2.5 SD Cutoff", "sd_gc_content");
52 
53     setScalingFlag (false);
54   }
55 
56   /**
57    *  Return the percent GC between a pair of bases if the GC content of the
58    *  range is more than 2.5 standard deviations from the average GC content
59    *  of the sequence.  It returns the average GC content of the sequence
60    *  otherwise.
61    *  @param start The start base (included in the range).
62    *  @param end The end base (included in the range).
63    *  @param values The one return value for this algorithm is returned in
64    *    this array.
65    **/
getValues(int start, int end, float [] values)66   public void getValues (int start, int end, float [] values) {
67     final int window_size = end - start + 1;
68 
69     final float standard_deviation;
70 
71     if (window_size > getDefaultMaxWindowSize ().intValue ()) {
72       standard_deviation = calculateSD (window_size);
73     } else {
74       if (standard_deviations[window_size - 1] < 0) {
75         // set the cached value
76         standard_deviation = calculateSD (window_size);
77         standard_deviations[window_size - 1] = standard_deviation;
78 
79         //       System.err.println ("SD: " + standard_deviation);
80       } else {
81         standard_deviation = standard_deviations[window_size - 1];
82       }
83     }
84 
85     final String sequence;
86 
87     try {
88       sequence = getStrand ().getSubSequence (new Range (start, end));
89     } catch (OutOfRangeException e) {
90       throw new Error ("internal error - unexpected exception: " + e);
91     }
92 
93     float gc_count = 0;
94 
95     for (int i = 0 ; i < sequence.length () ; ++i) {
96       final char this_char = sequence.charAt (i);
97 //      System.out.println (this_char);
98 
99       if (this_char == 'g' || this_char == 'c') {
100         ++gc_count;
101       }
102     }
103 
104     final float gc_content = gc_count/sequence.length () * 100;
105 
106     final float gc_average =
107       getStrand ().getBases ().getAverageGCPercent ();
108 
109     if (Math.abs (gc_content - gc_average) < standard_deviation * 2.5) {
110       values[0] = gc_average;
111     } else {
112       values[0] = gc_content;
113     }
114   }
115 
116   /**
117    *  Calculate and return the standard deviation of the GC content of the
118    *  Bases object of the Strand that was passed to the constructor.
119    **/
calculateSD(final int window_size)120   private float calculateSD (final int window_size) {
121 //      System.err.println ("calculateSD:");
122 
123     final int sequence_length = getStrand ().getBases ().getLength ();
124 
125     final String bases = getStrand ().getBases ().toString ();
126 
127     // the number of windows to search
128     final int window_count = sequence_length - window_size;
129 
130 //      System.err.println ("window_size: " + window_size);
131 //      System.err.println ("window_count: " + window_count);
132 
133     // this is the sum over all the windows of:
134     // (GC content) * (percent GC) / window_size * window_size
135     double sum_so_far = 0;
136 
137     // this is the sum over all the windows of (GC content)/window_size
138     double gc_so_far = 0;
139 
140     double current_gc_count = 0;
141 
142     for (int i = - window_size ; i < window_count ; ++i) {
143       if (i > 0) {
144         final char previous_char = bases.charAt (i - 1);
145 
146         if (previous_char == 'g' || previous_char == 'c') {
147           --current_gc_count;
148         }
149       }
150 
151       final char new_char = bases.charAt (i + window_size);
152 
153       if (new_char == 'g' || new_char == 'c') {
154         ++current_gc_count;
155       }
156 
157       if (i >= 0) {
158         final double this_value = current_gc_count / window_size;
159 
160         sum_so_far += this_value * this_value;
161 
162         gc_so_far += this_value;
163       }
164     }
165 
166 //      System.err.println ("sum_so_far: " + sum_so_far);
167 //      System.err.println ("gc_so_far: " + gc_so_far);
168 
169     final double gc_average = gc_so_far / window_count;
170 
171 //      System.err.println ("gc_average: " + gc_average);
172 //      System.err.println ("sum_so_far/window_count: " +
173 //                          (sum_so_far/window_count));
174 //      System.err.println ("gc_average*gc_average: " + (gc_average * gc_average));
175 
176 //      System.err.println ("sd*sd: " + (sum_so_far / window_count -
177 //                                       gc_average * gc_average));
178 
179     return (float) Math.sqrt (sum_so_far / window_count -
180                               gc_average * gc_average) * 100;
181   }
182 
183   /**
184    *  Return the number of values a call to getValues () will return - one
185    *  in this case.
186    **/
getValueCount()187   public int getValueCount () {
188     return 1;
189   }
190 
191   /**
192    *  Return the default or optimal window size.
193    *  @return null is returned if this algorithm doesn't have optimal window
194    *    size.
195    **/
getDefaultWindowSize()196   public Integer getDefaultWindowSize () {
197     final Integer super_window_size = super.getDefaultWindowSize ();
198     if (super_window_size != null) {
199       // the superclass version of getDefaultWindowSize () returns non-null
200       // iff the user has set the window size in the options file
201       return super_window_size;
202     }
203     return new Integer (1000);
204   }
205 
206   /**
207    *  Return the default maximum window size for this algorithm.
208    *  @return null is returned if this algorithm doesn't have maximum window
209    *    size.
210    **/
getDefaultMaxWindowSize()211   public Integer getDefaultMaxWindowSize () {
212     final Integer super_max_window_size = super.getDefaultMaxWindowSize ();
213     if (super_max_window_size != null) {
214       // the superclass version of getDefaultMaxWindowSize () returns non-null
215       // iff the user has set the max window size in the options file
216       return super_max_window_size;
217     }
218     return new Integer (5000);
219   }
220 
221   /**
222    *  Return the default minimum window size for this algorithm.
223    *  @return null is returned if this algorithm doesn't have minimum window
224    *    size.
225    **/
getDefaultMinWindowSize()226   public Integer getDefaultMinWindowSize () {
227     final Integer super_min_window_size = super.getDefaultMinWindowSize ();
228     if (super_min_window_size != null) {
229       // the superclass version of getDefaultMinWindowSize () returns non-null
230       // iff the user has set the minimum window size in the options file
231       return super_min_window_size;
232     }
233     return new Integer (100);
234   }
235 
236   /**
237    *  Return the default or optimal step size.
238    *  @return null is returned if this algorithm doesn't have optimal step
239    *    size.
240    **/
getDefaultStepSize(int window_size)241   public Integer getDefaultStepSize (int window_size) {
242     return new Integer (1);
243   }
244 
245   /**
246    *  Return the average value of function over the whole strand.
247    *  @return null is returned if this algorithm doesn't have an average or if
248    *    the average can't be calculated.
249    **/
getAverage()250   public Float getAverage () {
251     return new Float (getStrand ().getBases ().getAverageGCPercent ());
252   }
253 
254   /**
255    *  Return the maximum value of this algorithm.
256    *  @return The maximum is 100.
257    **/
getMaximumInternal()258   protected Float getMaximumInternal () {
259     return new Float (100);
260   }
261 
262   /**
263    *  Return the minimum value of this algorithm.
264    *  @return The minimum is 0.
265    **/
getMinimumInternal()266   protected Float getMinimumInternal () {
267     return new Float (0);
268   }
269 
270   /**
271    *  This array contains a cache of the standard deviations of the GC content
272    *  of the Bases object of the Strand that was passed to the constructor.
273    *  (Set by the constructor).  The array is indexed by window size.
274    **/
275   float [] standard_deviations;
276 
277   {
278     final int max_window_size = getDefaultMaxWindowSize ().intValue ();
279     standard_deviations = new float [max_window_size];
280 
281     for (int i = 0 ; i < max_window_size ; ++i) {
282       standard_deviations[i] = -1;
283     }
284   }
285 }
286