1 /* KarlinSigAlgorithm.java
2  *
3  * created: Thu Aug  9 2001
4  *
5  * This file is part of Artemis
6  *
7  * Copyright (C) 1999  Genome Research Limited
8  *
9  * This program is free software; you can redistribute it and/or
10  * modify it under the terms of the GNU General Public License
11  * as published by the Free Software Foundation; either version 2
12  * of the License, or (at your option) any later version.
13  *
14  * This program is distributed in the hope that it will be useful,
15  * but WITHOUT ANY WARRANTY; without even the implied warranty of
16  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
17  * GNU General Public License for more details.
18  *
19  * You should have received a copy of the GNU General Public License
20  * along with this program; if not, write to the Free Software
21  * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
22  *
23  * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/plot/KarlinSigAlgorithm.java,v 1.4 2008-06-27 10:01:30 tjc Exp $
24  */
25 
26 package uk.ac.sanger.artemis.plot;
27 
28 import uk.ac.sanger.artemis.util.*;
29 import uk.ac.sanger.artemis.io.Range;
30 
31 import uk.ac.sanger.artemis.sequence.*;
32 
33 /**
34  *  This class contains the code for calculating the absolute "genomic
35  *  signature" difference.
36  *  For details see "Global dinucleotide signatures and analysis of genomic
37  *  heterogeneity" Samuel Karlin - Current Opinion in Microbiology 1998,
38  *  1:598-610.
39  *  Objects of this class have one useful method - getValues (), which takes a
40  *  range of bases and returns three floating point numbers, which are the
41  *  codon usage scores in each frame.  The Strand to use is set in the
42  *  constructor.
43  *
44  *  @author Kim Rutherford
45  *  @version $Id: KarlinSigAlgorithm.java,v 1.4 2008-06-27 10:01:30 tjc Exp $
46  **/
47 
48 public class KarlinSigAlgorithm extends BaseAlgorithm {
49   /**
50    *  Create a new KarlinSigAlgorithm object.
51    *  @param strand The Strand to do the calculation on.
52    **/
KarlinSigAlgorithm(final Strand strand)53   public KarlinSigAlgorithm (final Strand strand) {
54     super (strand, "Karlin Signature Difference", "karlin_sig");
55 
56     setScalingFlag (true);
57   }
58 
59   /**
60    *  Return the Karlin genomic signature for the given range of bases.
61    *  @param start The start base (included in the range).
62    *  @param end The end base (included in the range).  If the start/end pair
63    *    doesn't give a multiple of three bases end is moved down so that it is
64    *    a multiple of three.
65    *  @param values The one return value for this algorithm is returned in
66    *    this array.
67    **/
getValues(int start, int end, final float [] values)68   public void getValues (int start, int end, final float [] values) {
69     // add 1 or 2 if necessary to make the range a multiple of 3
70     end -= (end - start + 1) % 3;
71 
72     final char[] sub_sequence;
73 
74     try {
75       // reset
76       if(end-start > 1000)
77         ((uk.ac.sanger.artemis.io.StreamSequence)(getStrand().getBases().getSequence())).forceReset();
78       sub_sequence = getStrand().getRawSubSequenceC(new Range (start, end));
79     } catch (OutOfRangeException e) {
80       throw new Error ("internal error - unexpected exception: " + e);
81     }
82 
83     final float [][] global_relative_abundance_values =
84       getGlobalRelativeAbundance ();
85     final float [][] subseq_relative_abundance_values =
86       getRelativeAbundance (sub_sequence);
87 
88     float signature_difference = 0;
89 
90     for (int first_base = 0 ; first_base < 4 ; ++first_base) {
91       for (int second_base = 0 ; second_base < 4 ; ++second_base) {
92         final float global_value =
93           global_relative_abundance_values[first_base][second_base];
94         final float subseq_value =
95           subseq_relative_abundance_values[first_base][second_base];
96         signature_difference += Math.abs (global_value - subseq_value);
97       }
98     }
99 
100     values [0] = (float) signature_difference / 16f ;
101   }
102 
103   /**
104    *  Return the number of values a call to getValues () will return - three
105    *  in this case.
106    **/
getValueCount()107   public int getValueCount () {
108     return 1;
109   }
110 
111   /**
112    *  Return the default or optimal window size.
113    *  @return null is returned if this algorithm doesn't have optimal window
114    *    size.
115    **/
getDefaultWindowSize()116   public Integer getDefaultWindowSize () {
117     final Integer super_window_size = super.getDefaultWindowSize ();
118     if (super_window_size != null) {
119       // the superclass version of getDefaultWindowSize () returns non-null
120       // iff the user has set the window size in the options file
121       return super_window_size;
122     }
123     return new Integer (240);
124   }
125 
126   /**
127    *  Return the default maximum window size for this algorithm.
128    *  @return null is returned if this algorithm doesn't have maximum window
129    *    size.
130    **/
getDefaultMaxWindowSize()131   public Integer getDefaultMaxWindowSize () {
132     final Integer super_max_window_size = super.getDefaultMaxWindowSize ();
133     if (super_max_window_size != null) {
134       // the superclass version of getDefaultMaxWindowSize () returns non-null
135       // iff the user has set the max window size in the options file
136       return super_max_window_size;
137     }
138     return new Integer (5000);
139   }
140 
141   /**
142    *  Return the default minimum window size for this algorithm.
143    *  @return null is returned if this algorithm doesn't have minimum window
144    *    size.
145    **/
getDefaultMinWindowSize()146   public Integer getDefaultMinWindowSize () {
147     final Integer super_min_window_size = super.getDefaultMinWindowSize ();
148     if (super_min_window_size != null) {
149       // the superclass version of getDefaultMinWindowSize () returns non-null
150       // iff the user has set the minimum window size in the options file
151       return super_min_window_size;
152     }
153     return new Integer (24);
154   }
155 
156   /**
157    *  Return the default or optimal step size.
158    *  @return null is returned if this algorithm doesn't have optimal step
159    *    size.
160    **/
getDefaultStepSize(int window_size)161   public Integer getDefaultStepSize (int window_size) {
162     if (window_size > 10) {
163       return new Integer (window_size / 10);
164     } else {
165       return null;
166     }
167   }
168 
169   /**
170    *  Return the maximum value of this algorithm.
171    *  @return The maximum is 2.
172    **/
getMaximumInternal()173   protected Float getMaximumInternal () {
174     return new Float (2);
175   }
176 
177   /**
178    *  Return the minimum value of this algorithm.
179    *  @return The minimum is 0.
180    **/
getMinimumInternal()181   protected Float getMinimumInternal () {
182     return new Float (0);
183   }
184 
185   /**
186    *  Return the average value of function over the whole strand.
187    *  @return null is returned if this algorithm doesn't have an average or if
188    *    the average can't be calculated.
189    **/
getAverage()190   public Float getAverage () {
191     return null;
192   }
193 
194   /**
195    *  Return a 4x4 array containing the relative abundance values for each
196    *  dinucleotide pair.  Indexed by base (t,c,a,g).  The value for the
197    *  dinucleotide "TT" is stored in global_signature[0][0], "TC" is stored in
198    *  [0][1], etc.
199    **/
getRelativeAbundance(final char [] sequence_forward_raw)200   private float [][] getRelativeAbundance (final char [] sequence_forward_raw) {
201     final float [][] return_value = new float [4][4];
202 
203     final char [] sequence_reverse_raw =
204       Bases.reverseComplement (sequence_forward_raw);
205 
206     final int [] base_counts = new int [4];
207     final int [][] dinucleotide_base_counts = new int [4][4];
208 
209     int this_f_base_index = Bases.getIndexOfBase (sequence_forward_raw[0]);
210     int next_f_base_index = 0;
211     int this_r_base_index = Bases.getIndexOfBase (sequence_reverse_raw[0]);
212     int next_r_base_index = 0;
213 
214     for (int i = 0 ; i < sequence_forward_raw.length - 1 ; ++i)
215     {
216       next_f_base_index = Bases.getIndexOfBase (sequence_forward_raw[i + 1]);
217 
218       if (this_f_base_index < 4 && next_f_base_index < 4) {
219         ++base_counts[this_f_base_index];
220         ++dinucleotide_base_counts[this_f_base_index][next_f_base_index];
221       } else {
222         // ignore Ns
223       }
224 
225       next_r_base_index = Bases.getIndexOfBase (sequence_reverse_raw[i + 1]);
226 
227       if (this_r_base_index < 4 && next_r_base_index < 4) {
228         ++base_counts[this_r_base_index];
229         ++dinucleotide_base_counts[this_r_base_index][next_r_base_index];
230       } else {
231         // ignore Ns
232       }
233 
234       this_f_base_index = next_f_base_index;
235       this_r_base_index = next_r_base_index;
236     }
237 
238     // remember to add the last base
239     if (next_f_base_index < 4)
240       ++base_counts[next_f_base_index];
241 
242     if (next_r_base_index < 4)
243       ++base_counts[next_r_base_index];
244 
245 
246     for (int first_base_index = 0 ;
247          first_base_index < 4 ;
248          ++first_base_index)
249     {
250       for (int second_base_index = 0 ;
251            second_base_index < 4 ;
252            ++second_base_index)
253       {
254         final float dinucleotide_frequency =
255           1f * dinucleotide_base_counts[first_base_index][second_base_index] /
256           (sequence_reverse_raw.length - 1) / 2;
257         final float first_base_frequency =
258           1f * base_counts[first_base_index] /
259           sequence_reverse_raw.length / 2;
260         final float second_base_frequency =
261           1f * base_counts[second_base_index] /
262           sequence_reverse_raw.length / 2;
263 
264         return_value[first_base_index][second_base_index] =
265           dinucleotide_frequency /
266           (first_base_frequency * second_base_frequency);
267       }
268     }
269 
270     return return_value;
271   }
272 
273   /**
274    *  Return the relative abundance values for the complete sequence.  Indexed
275    *  by base (t,c,a,g).  The value for the dinucleotide "TT" is stored in
276    *  global_signature[0][0], "TC" is stored in [0][1], etc.
277    **/
getGlobalRelativeAbundance()278   private float [][] getGlobalRelativeAbundance () {
279     if (global_relative_abundance_values == null) {
280       try {
281         final Range whole_range =
282           new Range (1, getStrand ().getSequenceLength ());
283         final char[] sequence =
284           getStrand().getRawSubSequenceC(whole_range);
285         global_relative_abundance_values = getRelativeAbundance (sequence);
286       } catch (OutOfRangeException e) {
287         throw new Error ("internal error - unexpected exception: " + e);
288       }
289     }
290 
291     return global_relative_abundance_values;
292   }
293 
294   /**
295    *  Storage for relative abundance values for the complete sequence.
296    *  Indexed by base (t,c,a,g).  The value for the dinucleotide "TT" is
297    *  stored in global_signature[0][0], "TC" is stored in [0][1], etc.
298    **/
299   private float [][] global_relative_abundance_values = null;
300 }
301