1 /* KarlinSigAlgorithm.java 2 * 3 * created: Thu Aug 9 2001 4 * 5 * This file is part of Artemis 6 * 7 * Copyright (C) 1999 Genome Research Limited 8 * 9 * This program is free software; you can redistribute it and/or 10 * modify it under the terms of the GNU General Public License 11 * as published by the Free Software Foundation; either version 2 12 * of the License, or (at your option) any later version. 13 * 14 * This program is distributed in the hope that it will be useful, 15 * but WITHOUT ANY WARRANTY; without even the implied warranty of 16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 17 * GNU General Public License for more details. 18 * 19 * You should have received a copy of the GNU General Public License 20 * along with this program; if not, write to the Free Software 21 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 22 * 23 * $Header: //tmp/pathsoft/artemis/uk/ac/sanger/artemis/plot/KarlinSigAlgorithm.java,v 1.4 2008-06-27 10:01:30 tjc Exp $ 24 */ 25 26 package uk.ac.sanger.artemis.plot; 27 28 import uk.ac.sanger.artemis.util.*; 29 import uk.ac.sanger.artemis.io.Range; 30 31 import uk.ac.sanger.artemis.sequence.*; 32 33 /** 34 * This class contains the code for calculating the absolute "genomic 35 * signature" difference. 36 * For details see "Global dinucleotide signatures and analysis of genomic 37 * heterogeneity" Samuel Karlin - Current Opinion in Microbiology 1998, 38 * 1:598-610. 39 * Objects of this class have one useful method - getValues (), which takes a 40 * range of bases and returns three floating point numbers, which are the 41 * codon usage scores in each frame. The Strand to use is set in the 42 * constructor. 43 * 44 * @author Kim Rutherford 45 * @version $Id: KarlinSigAlgorithm.java,v 1.4 2008-06-27 10:01:30 tjc Exp $ 46 **/ 47 48 public class KarlinSigAlgorithm extends BaseAlgorithm { 49 /** 50 * Create a new KarlinSigAlgorithm object. 51 * @param strand The Strand to do the calculation on. 52 **/ KarlinSigAlgorithm(final Strand strand)53 public KarlinSigAlgorithm (final Strand strand) { 54 super (strand, "Karlin Signature Difference", "karlin_sig"); 55 56 setScalingFlag (true); 57 } 58 59 /** 60 * Return the Karlin genomic signature for the given range of bases. 61 * @param start The start base (included in the range). 62 * @param end The end base (included in the range). If the start/end pair 63 * doesn't give a multiple of three bases end is moved down so that it is 64 * a multiple of three. 65 * @param values The one return value for this algorithm is returned in 66 * this array. 67 **/ getValues(int start, int end, final float [] values)68 public void getValues (int start, int end, final float [] values) { 69 // add 1 or 2 if necessary to make the range a multiple of 3 70 end -= (end - start + 1) % 3; 71 72 final char[] sub_sequence; 73 74 try { 75 // reset 76 if(end-start > 1000) 77 ((uk.ac.sanger.artemis.io.StreamSequence)(getStrand().getBases().getSequence())).forceReset(); 78 sub_sequence = getStrand().getRawSubSequenceC(new Range (start, end)); 79 } catch (OutOfRangeException e) { 80 throw new Error ("internal error - unexpected exception: " + e); 81 } 82 83 final float [][] global_relative_abundance_values = 84 getGlobalRelativeAbundance (); 85 final float [][] subseq_relative_abundance_values = 86 getRelativeAbundance (sub_sequence); 87 88 float signature_difference = 0; 89 90 for (int first_base = 0 ; first_base < 4 ; ++first_base) { 91 for (int second_base = 0 ; second_base < 4 ; ++second_base) { 92 final float global_value = 93 global_relative_abundance_values[first_base][second_base]; 94 final float subseq_value = 95 subseq_relative_abundance_values[first_base][second_base]; 96 signature_difference += Math.abs (global_value - subseq_value); 97 } 98 } 99 100 values [0] = (float) signature_difference / 16f ; 101 } 102 103 /** 104 * Return the number of values a call to getValues () will return - three 105 * in this case. 106 **/ getValueCount()107 public int getValueCount () { 108 return 1; 109 } 110 111 /** 112 * Return the default or optimal window size. 113 * @return null is returned if this algorithm doesn't have optimal window 114 * size. 115 **/ getDefaultWindowSize()116 public Integer getDefaultWindowSize () { 117 final Integer super_window_size = super.getDefaultWindowSize (); 118 if (super_window_size != null) { 119 // the superclass version of getDefaultWindowSize () returns non-null 120 // iff the user has set the window size in the options file 121 return super_window_size; 122 } 123 return new Integer (240); 124 } 125 126 /** 127 * Return the default maximum window size for this algorithm. 128 * @return null is returned if this algorithm doesn't have maximum window 129 * size. 130 **/ getDefaultMaxWindowSize()131 public Integer getDefaultMaxWindowSize () { 132 final Integer super_max_window_size = super.getDefaultMaxWindowSize (); 133 if (super_max_window_size != null) { 134 // the superclass version of getDefaultMaxWindowSize () returns non-null 135 // iff the user has set the max window size in the options file 136 return super_max_window_size; 137 } 138 return new Integer (5000); 139 } 140 141 /** 142 * Return the default minimum window size for this algorithm. 143 * @return null is returned if this algorithm doesn't have minimum window 144 * size. 145 **/ getDefaultMinWindowSize()146 public Integer getDefaultMinWindowSize () { 147 final Integer super_min_window_size = super.getDefaultMinWindowSize (); 148 if (super_min_window_size != null) { 149 // the superclass version of getDefaultMinWindowSize () returns non-null 150 // iff the user has set the minimum window size in the options file 151 return super_min_window_size; 152 } 153 return new Integer (24); 154 } 155 156 /** 157 * Return the default or optimal step size. 158 * @return null is returned if this algorithm doesn't have optimal step 159 * size. 160 **/ getDefaultStepSize(int window_size)161 public Integer getDefaultStepSize (int window_size) { 162 if (window_size > 10) { 163 return new Integer (window_size / 10); 164 } else { 165 return null; 166 } 167 } 168 169 /** 170 * Return the maximum value of this algorithm. 171 * @return The maximum is 2. 172 **/ getMaximumInternal()173 protected Float getMaximumInternal () { 174 return new Float (2); 175 } 176 177 /** 178 * Return the minimum value of this algorithm. 179 * @return The minimum is 0. 180 **/ getMinimumInternal()181 protected Float getMinimumInternal () { 182 return new Float (0); 183 } 184 185 /** 186 * Return the average value of function over the whole strand. 187 * @return null is returned if this algorithm doesn't have an average or if 188 * the average can't be calculated. 189 **/ getAverage()190 public Float getAverage () { 191 return null; 192 } 193 194 /** 195 * Return a 4x4 array containing the relative abundance values for each 196 * dinucleotide pair. Indexed by base (t,c,a,g). The value for the 197 * dinucleotide "TT" is stored in global_signature[0][0], "TC" is stored in 198 * [0][1], etc. 199 **/ getRelativeAbundance(final char [] sequence_forward_raw)200 private float [][] getRelativeAbundance (final char [] sequence_forward_raw) { 201 final float [][] return_value = new float [4][4]; 202 203 final char [] sequence_reverse_raw = 204 Bases.reverseComplement (sequence_forward_raw); 205 206 final int [] base_counts = new int [4]; 207 final int [][] dinucleotide_base_counts = new int [4][4]; 208 209 int this_f_base_index = Bases.getIndexOfBase (sequence_forward_raw[0]); 210 int next_f_base_index = 0; 211 int this_r_base_index = Bases.getIndexOfBase (sequence_reverse_raw[0]); 212 int next_r_base_index = 0; 213 214 for (int i = 0 ; i < sequence_forward_raw.length - 1 ; ++i) 215 { 216 next_f_base_index = Bases.getIndexOfBase (sequence_forward_raw[i + 1]); 217 218 if (this_f_base_index < 4 && next_f_base_index < 4) { 219 ++base_counts[this_f_base_index]; 220 ++dinucleotide_base_counts[this_f_base_index][next_f_base_index]; 221 } else { 222 // ignore Ns 223 } 224 225 next_r_base_index = Bases.getIndexOfBase (sequence_reverse_raw[i + 1]); 226 227 if (this_r_base_index < 4 && next_r_base_index < 4) { 228 ++base_counts[this_r_base_index]; 229 ++dinucleotide_base_counts[this_r_base_index][next_r_base_index]; 230 } else { 231 // ignore Ns 232 } 233 234 this_f_base_index = next_f_base_index; 235 this_r_base_index = next_r_base_index; 236 } 237 238 // remember to add the last base 239 if (next_f_base_index < 4) 240 ++base_counts[next_f_base_index]; 241 242 if (next_r_base_index < 4) 243 ++base_counts[next_r_base_index]; 244 245 246 for (int first_base_index = 0 ; 247 first_base_index < 4 ; 248 ++first_base_index) 249 { 250 for (int second_base_index = 0 ; 251 second_base_index < 4 ; 252 ++second_base_index) 253 { 254 final float dinucleotide_frequency = 255 1f * dinucleotide_base_counts[first_base_index][second_base_index] / 256 (sequence_reverse_raw.length - 1) / 2; 257 final float first_base_frequency = 258 1f * base_counts[first_base_index] / 259 sequence_reverse_raw.length / 2; 260 final float second_base_frequency = 261 1f * base_counts[second_base_index] / 262 sequence_reverse_raw.length / 2; 263 264 return_value[first_base_index][second_base_index] = 265 dinucleotide_frequency / 266 (first_base_frequency * second_base_frequency); 267 } 268 } 269 270 return return_value; 271 } 272 273 /** 274 * Return the relative abundance values for the complete sequence. Indexed 275 * by base (t,c,a,g). The value for the dinucleotide "TT" is stored in 276 * global_signature[0][0], "TC" is stored in [0][1], etc. 277 **/ getGlobalRelativeAbundance()278 private float [][] getGlobalRelativeAbundance () { 279 if (global_relative_abundance_values == null) { 280 try { 281 final Range whole_range = 282 new Range (1, getStrand ().getSequenceLength ()); 283 final char[] sequence = 284 getStrand().getRawSubSequenceC(whole_range); 285 global_relative_abundance_values = getRelativeAbundance (sequence); 286 } catch (OutOfRangeException e) { 287 throw new Error ("internal error - unexpected exception: " + e); 288 } 289 } 290 291 return global_relative_abundance_values; 292 } 293 294 /** 295 * Storage for relative abundance values for the complete sequence. 296 * Indexed by base (t,c,a,g). The value for the dinucleotide "TT" is 297 * stored in global_signature[0][0], "TC" is stored in [0][1], etc. 298 **/ 299 private float [][] global_relative_abundance_values = null; 300 } 301