1 package dna; 2 import java.io.Serializable; 3 import java.util.ArrayList; 4 import java.util.Arrays; 5 import java.util.Locale; 6 7 import fileIO.ReadWrite; 8 import jgi.AssemblyStats2; 9 import shared.KillSwitch; 10 import shared.Shared; 11 import shared.Tools; 12 import structures.ByteBuilder; 13 import structures.Range; 14 15 16 public class ChromosomeArray implements Serializable { 17 18 19 /** 20 * 21 */ 22 private static final long serialVersionUID = 3199182397853127842L; 23 main(String[] args)24 public static void main(String[] args){ 25 translateFile(args[1], Byte.parseByte(args[0])); 26 } 27 28 translateFile(String fname, int chrom)29 private static void translateFile(String fname, int chrom){ 30 31 long time1=System.nanoTime(); 32 33 ChromosomeArray cha=read(fname, chrom); 34 cha.chromosome=chrom; 35 long time2=System.nanoTime(); 36 37 int dot=fname.lastIndexOf(".fa"); 38 String outfile=fname.substring(0,dot).replace("hs_ref_", "")+".chrom"; 39 40 System.out.println("Writing to "+outfile); 41 42 System.out.println("minIndex="+cha.minIndex+", maxIndex="+cha.maxIndex+", length="+cha.array.length+ 43 "; time="+String.format(Locale.ROOT, "%.3f seconds", (time2-time1)/1000000000d)); 44 45 long time3=System.nanoTime(); 46 ReadWrite.write(cha, outfile, false); 47 cha=null; 48 System.gc(); 49 cha=read(outfile); 50 long time4=System.nanoTime(); 51 52 System.out.println("minIndex="+cha.minIndex+", maxIndex="+cha.maxIndex+", length="+cha.array.length+ 53 "; time="+String.format(Locale.ROOT, "%.3f seconds", (time4-time3)/1000000000d)); 54 } 55 read(String fname, int chrom)56 public static ChromosomeArray read(String fname, int chrom){ 57 ChromosomeArray cha=read(fname); 58 assert(cha.chromosome<1); 59 cha.chromosome=chrom; 60 return cha; 61 } 62 63 public static ChromosomeArray read(String fname){ 64 65 // if(fname.endsWith(".chrom") || fname.endsWith(".chrom.gz")){} 66 ChromosomeArray ca=ReadWrite.read(ChromosomeArray.class, fname, false); 67 if(CHANGE_UNDEFINED_TO_N_ON_READ){ 68 ca.changeUndefinedToN(); 69 } 70 return ca; 71 } 72 73 public void changeUndefinedToN(){ 74 for(int i=0; i<array.length; i++){ 75 // array[i]=AminoAcid.numberToBase[AminoAcid.baseToNumberACGTother[array[i]]]; 76 if(!AminoAcid.isACGTN(array[i])){array[i]='N';} 77 } 78 } 79 80 public ChromosomeArray(){ 81 this((byte)-1, Shared.PLUS); 82 } 83 84 /** Actually does reverse complement */ 85 public ChromosomeArray complement(){ 86 byte otherStrand=(strand==Shared.MINUS ? Shared.PLUS : Shared.MINUS); 87 ChromosomeArray ca=new ChromosomeArray(chromosome, otherStrand, 0, maxIndex); 88 for(int i=0; i<=maxIndex; i++){ 89 int pos=maxIndex-i; 90 byte b=AminoAcid.baseToComplementExtended[array[i]]; 91 ca.array[pos]=b; 92 } 93 return ca; 94 } 95 96 public ChromosomeArray(int chrom, byte strnd){ 97 this(chrom, strnd, Integer.MAX_VALUE, -1); 98 } 99 100 public ChromosomeArray(int chrom, byte strnd, int min, int max){ 101 chromosome=chrom; 102 strand=strnd; 103 array=KillSwitch.allocByte1D(Tools.max(1000, max+1)); 104 minIndex=min; 105 maxIndex=max; 106 } 107 108 109 public void set(int loc, int val){ 110 111 if(loc>=array.length){//Increase size 112 int newlen=(int)(1+(3L*max(array.length, loc))/2); 113 assert(newlen>loc) : newlen+", "+loc+", "+array.length; 114 resize(newlen); 115 assert(array.length==newlen); 116 // System.err.println("Resized array to "+newlen); 117 } 118 if(CHANGE_U_TO_T && CHANGE_DEGENERATE_TO_N){ 119 val=AminoAcid.baseToACGTN[val]; 120 }else{ 121 val=Tools.toUpperCase((char)val); 122 if(AminoAcid.baseToNumberExtended[val]<0){val='N';} 123 } 124 array[loc]=(val>Byte.MAX_VALUE ? Byte.MAX_VALUE : (byte)val); 125 minIndex=min(loc, minIndex); 126 maxIndex=max(loc, maxIndex); 127 } 128 129 130 public void set(int loc, CharSequence s){ 131 int loc2=loc+s.length(); 132 if(loc2>array.length){//Increase size 133 int newlen=(int)(1+(3L*max(array.length, loc2))/2); 134 assert(newlen>loc2) : newlen+", "+loc2+", "+array.length; 135 resize(newlen); 136 assert(array.length==newlen); 137 // System.err.println("Resized array to "+newlen); 138 } 139 140 if(CHANGE_U_TO_T && CHANGE_DEGENERATE_TO_N){ 141 for(int i=0; i<s.length(); i++, loc++){ 142 array[loc]=AminoAcid.baseToACGTN[s.charAt(i)]; 143 } 144 }else{ 145 for(int i=0; i<s.length(); i++, loc++){ 146 char c=Tools.toUpperCase(s.charAt(i)); 147 if(AminoAcid.baseToNumberExtended[c]<0){c='N';} 148 assert(Tools.isLetter(c)); 149 assert(c<=Byte.MAX_VALUE); 150 array[loc]=(byte)c; 151 } 152 } 153 154 loc--; 155 assert(loc==loc2-1) : "loc="+loc+", loc2="+loc2+", s.len="+s.length(); 156 minIndex=min(loc, minIndex); 157 maxIndex=max(loc, maxIndex); 158 } 159 set(int loc, byte[] s)160 public void set(int loc, byte[] s){ 161 set(loc, s, s.length); 162 } 163 set(int loc, ByteBuilder bb)164 public void set(int loc, ByteBuilder bb){ 165 set(loc, bb.array, bb.length()); 166 } 167 set(int loc, byte[] s, final int slen)168 public void set(int loc, byte[] s, final int slen){ 169 assert(slen<=s.length && slen>=0); 170 int loc2=loc+slen; 171 if(loc2>array.length){//Increase size 172 int newlen=(int)(1+(3L*max(array.length, loc2))/2); 173 assert(newlen>loc2) : newlen+", "+loc2+", "+array.length; 174 resize(newlen); 175 assert(array.length==newlen); 176 // System.err.println("Resized array to "+newlen); 177 } 178 179 if(CHANGE_U_TO_T && CHANGE_DEGENERATE_TO_N){ 180 for(int i=0; i<slen; i++, loc++){ 181 byte b=(byte)Tools.max(0, s[i]); 182 array[loc]=AminoAcid.baseToACGTN[b]; 183 } 184 }else{ 185 for(int i=0; i<slen; i++, loc++){ 186 char c=Tools.max((char)0, Tools.toUpperCase((char)s[i])); 187 if(AminoAcid.baseToNumberExtended[c]<0){c='N';} 188 assert(Tools.isLetter(c)); 189 assert(c<=Byte.MAX_VALUE); 190 array[loc]=(byte)c; 191 } 192 } 193 loc--; 194 assert(loc==loc2-1) : "loc="+loc+", loc2="+loc2+", s.len="+slen; 195 minIndex=min(loc, minIndex); 196 maxIndex=max(loc, maxIndex); 197 } 198 199 /** 200 * @param loc 201 * @param length 202 * @param counts 203 * @return gc fraction 204 */ calcGC(int loc, int length, int[] counts)205 public float calcGC(int loc, int length, int[] counts) { 206 counts=countACGTINOC(loc, length, counts); 207 long at=counts[0]+counts[3]; 208 long gc=counts[1]+counts[2]; 209 return gc/(float)Tools.max(at+gc, 1); 210 } 211 212 /** 213 * @param loc 214 * @param length 215 * @return counts: {A, C, G, T, Iupac, N, Other, Control} 216 */ countACGTINOC(final int loc, final int length, int[] counts)217 public int[] countACGTINOC(final int loc, final int length, int[] counts) { 218 final int lim=loc+length; 219 assert(loc>=0 && lim<=maxIndex+1 && loc<=lim); 220 if(counts==null){counts=new int[8];} 221 else{Arrays.fill(counts, 0);} 222 assert(counts.length==8); 223 for(int i=loc; i<lim; i++){ 224 byte b=get(i); 225 int num=charToNum[b]; 226 counts[num]++; 227 } 228 return counts; 229 } 230 231 232 /** Returns the letter (IUPAC) representation of the base, as a byte */ get(int loc)233 public byte get(int loc){ 234 return loc<minIndex || loc>=maxIndex ? (byte)'N' : array[loc]; 235 } 236 getString(int a, int b)237 public String getString(int a, int b){ 238 StringBuilder sb=new StringBuilder(b-a+1); 239 for(int i=a; i<=b; i++){ 240 sb.append((char)get(i)); 241 } 242 return sb.toString(); 243 } 244 245 /** Returns FASTA format bytes. Same as getString, but faster. */ getBytes(int a, int b)246 public byte[] getBytes(int a, int b){ 247 byte[] out=KillSwitch.copyOfRange(array, a, b+1); 248 // assert(out[0]>0 && out[out.length-1]>0) : a+", "+b+", "+minIndex+", "+maxIndex+", "+array.length; 249 if(a<minIndex || b>maxIndex){ 250 for(int i=0; i<out.length; i++){ 251 if(out[i]==0){out[i]='N';} 252 } 253 } 254 return out; 255 } 256 getNumberACGTN(int loc)257 public byte getNumberACGTN(int loc){ 258 return AminoAcid.baseToNumberACGTN[array[loc]]; 259 } 260 getNumber(int loc)261 public byte getNumber(int loc){ 262 return AminoAcid.baseToNumber[array[loc]]; 263 } 264 isFullyDefined(int a, int b)265 public boolean isFullyDefined(int a, int b){ 266 for(int i=a; i<=b; i++){ 267 int x=AminoAcid.baseToNumber[array[i]]; 268 if(x<0){return false;} 269 } 270 return true; 271 } 272 isFullyUndefined(int a, int b)273 public boolean isFullyUndefined(int a, int b){ 274 for(int i=a; i<=b; i++){ 275 int x=AminoAcid.baseToNumber[array[i]]; 276 if(x>=0){return false;} 277 } 278 return true; 279 } 280 countDefinedBases()281 public int countDefinedBases(){ 282 return countDefinedBases(minIndex, maxIndex); 283 } 284 countDefinedBases(int a, int b)285 public int countDefinedBases(int a, int b){ 286 int sum=0; 287 for(int i=a; i<=b; i++){ 288 int x=AminoAcid.baseToNumber[array[i]]; 289 if(x>=0){sum++;} 290 } 291 return sum; 292 } 293 getNumber(int a, int b)294 public int getNumber(int a, int b){ 295 return toNumber(a, b, array); 296 } 297 toNumber(int a, int b, byte[] bases)298 public static int toNumber(int a, int b, byte[] bases){ 299 assert(b>=a); 300 assert(b-a<17); //<17 for unsigned, <16 for signed 301 int out=0; 302 for(int i=a; i<=b; i++){ 303 int x=AminoAcid.baseToNumber[bases[i]]; 304 if(x<0){return -1;} 305 out=((out<<2)|x); 306 } 307 return out; 308 } 309 toNumber(int a, int b, String bases)310 public static int toNumber(int a, int b, String bases){ 311 int out=0; 312 for(int i=a; i<=b; i++){ 313 int x=AminoAcid.baseToNumber[bases.charAt(i)]; 314 if(x<0){return -1;} 315 out=((out<<2)|x); 316 } 317 return out; 318 } 319 resize(int newlen)320 public void resize(int newlen){ 321 byte[] temp=KillSwitch.allocByte1D(newlen); 322 int lim=min(array.length, newlen); 323 assert(lim>=maxIndex) : lim+","+maxIndex; 324 for(int i=0; i<lim; i++){ 325 temp[i]=array[i]; 326 } 327 array=temp; 328 } 329 toBaseString()330 public String toBaseString(){ 331 String s=new String(array); 332 return s; 333 } 334 nearestDefinedBase()335 public char[] nearestDefinedBase(){ 336 char[] r=new char[array.length]; 337 final char max=Character.MAX_VALUE; 338 339 char dist=max; 340 for(int i=0; i<r.length; i++){ 341 byte b=array[i]; 342 if(b=='A' || b=='C' || b=='G' || b=='T'){ 343 dist=0; 344 }else{ 345 dist=(dist==max ? max : (char)(dist+1)); 346 } 347 r[i]=dist; 348 } 349 350 dist=r[r.length-1]; 351 for(int i=r.length-1; i>=0; i--){ 352 byte b=array[i]; 353 if(b=='A' || b=='C' || b=='G' || b=='T'){ 354 dist=0; 355 }else{ 356 dist=(dist==max ? max : (char)(dist+1)); 357 } 358 r[i]=Tools.min(dist, r[i]); 359 } 360 return r; 361 } 362 toContigRanges(final int nBlockSize)363 public ArrayList<Range> toContigRanges(final int nBlockSize){ 364 assert(nBlockSize>0); 365 ArrayList<Range> list=new ArrayList<Range>(); 366 367 int start=-1; 368 int stop=-1; 369 int ns=nBlockSize+1; 370 371 boolean contig=false; 372 373 for(int i=minIndex; i<=maxIndex; i++){ 374 byte b=array[i]; 375 if(b=='N' || b=='X'){ 376 ns++; 377 if(contig && (b=='X' || ns>=nBlockSize)){ 378 Range r=new Range(start, stop); 379 list.add(r); 380 contig=false; 381 } 382 }else{ 383 ns=0; 384 if(!contig){start=i;} 385 contig=true; 386 stop=i; 387 } 388 } 389 if(contig){ 390 Range r=new Range(start, stop); 391 list.add(r); 392 } 393 return list; 394 } 395 396 equalsIgnoreCase(ChromosomeArray other)397 public boolean equalsIgnoreCase(ChromosomeArray other){ 398 if(minIndex!=other.minIndex){System.err.println("a");return false;} 399 if(maxIndex!=other.maxIndex){System.err.println("b");return false;} 400 if(chromosome!=other.chromosome){System.err.println("c");return false;} 401 if(array.length!=other.array.length){System.err.println("d");return false;} 402 for(int i=minIndex; i<=maxIndex; i++){ 403 if(Tools.toLowerCase(array[i])!=Tools.toLowerCase(other.array[i])){ 404 System.err.println("e"); 405 return false; 406 } 407 } 408 return true; 409 } 410 min(long x, long y)411 private static final long min(long x, long y){return x<y ? x : y;} max(long x, long y)412 private static final long max(long x, long y){return x>y ? x : y;} min(int x, int y)413 private static final int min(int x, int y){return x<y ? x : y;} max(int x, int y)414 private static final int max(int x, int y){return x>y ? x : y;} 415 416 public final byte strand; 417 public int chromosome; 418 public byte[] array; 419 public int maxIndex=-1; 420 public int minIndex=Integer.MAX_VALUE; 421 422 public static boolean CHANGE_UNDEFINED_TO_N_ON_READ=false; 423 public static boolean CHANGE_U_TO_T=true; 424 public static boolean CHANGE_DEGENERATE_TO_N=true; 425 426 /** Translation array for tracking base counts */ 427 private static final byte[] charToNum=AssemblyStats2.makeCharToNum(); 428 429 430 } 431