1 package pacbio; 2 3 import java.io.File; 4 import java.util.ArrayList; 5 6 import dna.ChromosomeArray; 7 import dna.Data; 8 import dna.FastaToChromArrays2; 9 import fileIO.ReadWrite; 10 import fileIO.TextFile; 11 import fileIO.TextStreamWriter; 12 import shared.Parse; 13 import shared.PreParser; 14 import shared.Timer; 15 import shared.Tools; 16 import stream.SiteScore; 17 import structures.CoverageArray; 18 import structures.CoverageArray2; 19 import structures.Range; 20 21 /** 22 * @author Brian Bushnell 23 * @date Jul 26, 2012 24 * 25 */ 26 public class SplitOffPerfectContigs { 27 main(String[] args)28 public static void main(String[] args){ 29 {//Preparse block for help, config files, and outstream 30 PreParser pp=new PreParser(args, new Object() { }.getClass().getEnclosingClass(), false); 31 args=pp.args; 32 //outstream=pp.outstream; 33 } 34 35 Timer t=new Timer(); 36 37 // ChromosomeArray c=new ChromosomeArray(1, (byte)1, "ANNNAAAANAAANNA"); 38 // System.out.println(c.toContigRanges(3)); 39 // System.out.println(c.toContigRanges(2)); 40 // System.out.println(c.toContigRanges(1)); 41 // assert(false); 42 43 44 Data.GENOME_BUILD=-1; 45 String dest=null; 46 String covfile=null; 47 String sitesfile=null; 48 String contigfile=null; 49 int trigger=50; 50 int blocklen=100; 51 int mincoverage=2; 52 int padding=4; 53 int buildout=-1; 54 String name=null; 55 String source=null; 56 57 58 for(int i=0; i<args.length; i++){ 59 final String arg=args[i]; 60 final String[] split=arg.split("="); 61 String a=split[0].toLowerCase(); 62 String b=split.length>1 ? split[1] : null; 63 64 if(a.equals("genome") || a.equals("build")){ 65 Data.setGenome(Integer.parseInt(b)); 66 name=Data.name; 67 source=Data.genomeSource; 68 System.out.println("Set Data.GENOME_BUILD to "+Data.GENOME_BUILD); 69 }else if(a.equals("outgenome") || a.equals("outbuild") || a.equals("genomeout") || a.equals("buildout")){ 70 buildout=Integer.parseInt(b); 71 }else if(a.equals("out") || a.equals("outfile")){ 72 dest=b; 73 }else if(a.startsWith("cov") || a.startsWith("pcov") || a.startsWith("perfectcov")){ 74 covfile=b; 75 }else if(a.startsWith("sites") || a.startsWith("psites") || a.startsWith("perfectsites")){ 76 sitesfile=b; 77 }else if(a.equals("padding")){ 78 padding=Integer.parseInt(b); 79 }else if(a.equals("trigger")){ 80 trigger=Integer.parseInt(b); 81 }else if(a.startsWith("mincov")){ 82 mincoverage=Integer.parseInt(b); 83 }else if(a.equals("blocklen")){ 84 blocklen=Integer.parseInt(b); 85 }else if(a.equals("contigfile")){ 86 contigfile=b; 87 }else if(a.equals("verbose")){ 88 verbose=Parse.parseBoolean(b); 89 }else if(a.startsWith("breakbad") || a.startsWith("splitbad") || a.startsWith("splitchim")){ 90 BREAK_BAD_CONTIGS=Parse.parseBoolean(b); 91 }else{ 92 throw new RuntimeException("Unknown parameter: "+args[i]); 93 } 94 } 95 96 assert(Data.GENOME_BUILD>-1); 97 if(buildout<=0){buildout=Data.GENOME_BUILD;} 98 // assert(buildout!=Data.GENOME_BUILD); //For testing 99 100 TextStreamWriter tsw=new TextStreamWriter(dest, false, true, false); 101 tsw.start(); 102 103 //Break into contigs 104 long contig=1; 105 106 if(contigfile!=null){ 107 if(new File(contigfile).exists()){ 108 TextFile tf=new TextFile(contigfile, false); 109 String s=tf.nextLine(); 110 if(s!=null){contig=Long.parseLong(s);} 111 tf.close(); 112 } 113 } 114 115 ArrayList<CoverageArray> calist=null; 116 if(sitesfile!=null){ 117 calist=toCoverage(sitesfile, padding); 118 System.out.println("Made coverage; list size is "+calist.size()); 119 } 120 121 if(buildout==Data.GENOME_BUILD){ 122 String fname=Data.chromFname(1, buildout); 123 fname=fname.replaceFirst("/genome/", "/index/"); 124 fname=fname.substring(0, fname.lastIndexOf('/')); 125 File dir=new File(fname); 126 if(dir.exists()){ 127 System.out.println("Deleting old index."); 128 for(File f2 : dir.listFiles()){ 129 if(f2.isFile() && !f2.isDirectory() && f2.getName().contains(".int2d")){f2.delete();} 130 } 131 } 132 } 133 134 for(int chrom=1; chrom<=Data.numChroms; chrom++){ 135 ChromosomeArray cha=Data.getChromosome(chrom); 136 Data.unload(chrom, true); 137 CoverageArray ca=null; 138 if(calist!=null){ 139 if(calist.size()>chrom){ 140 ca=calist.get(chrom); 141 calist.set(chrom, null); 142 } 143 }else{ 144 assert(covfile!=null && covfile.contains("#")); 145 ca=ReadWrite.read(CoverageArray.class, covfile.replaceFirst("#", ""+chrom), true); 146 if(ca==null){System.out.println("Can't find coverage for chrom "+chrom+" in file "+covfile.replaceFirst("#", ""+chrom));} 147 } 148 if(ca!=null){ 149 contig=writeContigs(cha, ca, contig, trigger, mincoverage, blocklen, tsw, buildout, shared.Tools.max(1, padding)); 150 }else{ 151 System.out.println("Can't find coverage for chrom "+chrom); 152 } 153 } 154 155 156 tsw.poison(); 157 158 if(contigfile!=null){ 159 ReadWrite.writeString(""+contig, contigfile, false); 160 } 161 162 FastaToChromArrays2.writeInfo(buildout, Data.numChroms, name, source, false, false); 163 164 t.stop(); 165 166 System.out.println(" \tWrote \tKept \tDropped \tSplit"); 167 System.out.println("Bases \t"+basesWritten+" \t"+basesKept+" \t"+basesDropped+" \t"+basesX); 168 System.out.println("Contigs \t"+contigsWritten+" \t"+contigsKept+" \t"+contigsDropped+" \t"+contigsX); 169 System.out.println("Avg Len \t"+(basesWritten/Tools.max(contigsWritten,1))+" \t"+(basesKept/Tools.max(contigsKept,1)) 170 +" \t"+(basesDropped/Tools.max(contigsDropped, 1))+" \t"+(basesX/Tools.max(contigsX, 1))); 171 172 System.out.println("Time:\t"+t); 173 } 174 writeContigs(ChromosomeArray cha, CoverageArray ca, long contig, int trigger, int minAcceptableCoverage, int fastaBlocklen, TextStreamWriter tsw, int buildout, int tipbuffer)175 public static long writeContigs(ChromosomeArray cha, CoverageArray ca, long contig, int trigger, int minAcceptableCoverage, int fastaBlocklen, 176 TextStreamWriter tsw, int buildout, int tipbuffer){ 177 178 ArrayList<Range> list=cha.toContigRanges(trigger); 179 180 int minContig=MIN_CONTIG_TO_ADD; 181 182 if(BREAK_BAD_CONTIGS){ 183 for(Range r : list){ 184 if(r.length>=minContig){ 185 // int uncovered=0; 186 // for(int i=r.a; i<=r.b; i++){ 187 // int cov=ca.get(i); 188 // if(cov<minAcceptableCoverage){uncovered++;} 189 // } 190 191 192 //Forward pass 193 int lastx=-1000; 194 int contiglen=0; 195 for(int i=r.a; i<=r.b; i++){ 196 int cov=ca.get(i); 197 if(cov<minAcceptableCoverage){ 198 if(contiglen>=minContig){ 199 byte c=cha.get(i); 200 if(c!='N' && c!='X'){basesX++;} 201 if(i-lastx>10){ 202 contigsX++; 203 } 204 cha.set(i, 'X'); 205 lastx=i; 206 } 207 contiglen=0; 208 }else{ 209 contiglen++; 210 } 211 } 212 213 //Reverse pass 214 lastx=Integer.MAX_VALUE; 215 contiglen=0; 216 for(int i=r.b; i>=r.a; i--){ 217 int cov=ca.get(i); 218 if(cov<minAcceptableCoverage){ 219 if(contiglen>=minContig){ 220 byte c=cha.get(i); 221 if(c!='N' && c!='X'){basesX++;} 222 if(lastx-i>10){ 223 contigsX++; 224 } 225 cha.set(i, 'X'); 226 lastx=i; 227 } 228 contiglen=0; 229 }else{ 230 contiglen++; 231 } 232 } 233 } 234 } 235 list=cha.toContigRanges(trigger); 236 } 237 238 239 ArrayList<Range> good=new ArrayList<Range>(); 240 ArrayList<Range> bad=new ArrayList<Range>(); 241 int badlen=0; 242 243 for(Range r : list){ 244 if(r.length>=minContig){ 245 int minCov=Integer.MAX_VALUE; 246 for(int i=r.a+tipbuffer; i<=r.b-tipbuffer; i++){ 247 minCov=Tools.min(minCov, ca.get(i)); 248 } 249 if(minCov>=minAcceptableCoverage){ 250 good.add(r); 251 if(verbose){ 252 StringBuilder sb0=new StringBuilder(), sb1=new StringBuilder(), sb2=new StringBuilder(); 253 for(int i=r.a; i<=r.b; i++){ 254 int cov=ca.get(i); 255 char b=(char) cha.get(i); 256 sb0.append(b); 257 sb1.append(b+"\t"); 258 sb2.append(cov+"\t"); 259 } 260 System.out.println(sb0+"\n"+sb1+"\n"+sb2+"\n"); 261 } 262 }else{ 263 bad.add(r); 264 badlen+=r.length+N_PAD_LENGTH; 265 if(verbose){ 266 StringBuilder sb0=new StringBuilder(), sb1=new StringBuilder(), sb2=new StringBuilder(); 267 for(int i=r.a; i<=r.b; i++){ 268 int cov=ca.get(i); 269 char b=(char) cha.get(i); 270 sb0.append(b); 271 sb1.append(b+"\t"); 272 sb2.append(cov+"\t"); 273 } 274 System.err.println(sb0+"\n"+sb1+"\n"+sb2+"\n"); 275 } 276 } 277 }else{ 278 contigsDropped++; 279 basesDropped+=r.length; 280 } 281 } 282 283 for(Range r : good){ 284 contigsWritten++; 285 basesWritten+=r.length; 286 String s=cha.getString(r.a, r.b); 287 tsw.print(">"+contig+"\n"); 288 contig++; 289 writeContig(s, tsw, fastaBlocklen); 290 // for(int i=r.a; i<=r.b; i++){cha.set(i, 'N');} //Delete "good" contigs from reference. 291 } 292 293 badlen=badlen+2*N_PAD_LENGTH2-N_PAD_LENGTH+10; 294 ChromosomeArray cha2=new ChromosomeArray(cha.chromosome, cha.strand, 0, badlen); 295 cha2.maxIndex=-1; 296 cha2.minIndex=0; 297 for(int i=0; i<N_PAD_LENGTH2; i++){ 298 cha2.set(i, 'N'); 299 } 300 for(Range r : bad){ 301 contigsKept++; 302 basesKept+=r.length; 303 304 String s=cha.getString(r.a, r.b); 305 for(int i=0; i<s.length(); i++){ 306 cha2.set(cha2.maxIndex+1, s.charAt(i)); 307 } 308 for(int i=0; i<N_PAD_LENGTH; i++){ 309 cha2.set(cha2.maxIndex+1, 'N'); 310 } 311 } 312 for(int i=N_PAD_LENGTH; i<N_PAD_LENGTH2; i++){ 313 cha2.set(cha2.maxIndex+1, 'N'); 314 } 315 316 // ReadWrite.writeObjectInThread(cha2, Data.chromFname(cha2.chromosome, Data.GENOME_BUILD)); 317 String fname=Data.chromFname(cha2.chromosome, buildout); 318 { 319 File f=new File(fname.substring(0, fname.lastIndexOf('/'))); 320 if(!f.exists()){ 321 f.mkdirs(); 322 } 323 } 324 325 ReadWrite.write(cha2, fname, false); 326 327 return contig; 328 } 329 writeContig(CharSequence sb, TextStreamWriter tsw, int blocklen)330 public static void writeContig(CharSequence sb, TextStreamWriter tsw, int blocklen){ 331 for(int i=0; i<sb.length(); i+=blocklen){ 332 int max=Tools.min(i+blocklen, sb.length()); 333 tsw.println(sb.subSequence(i, max)); 334 } 335 } 336 337 toCoverage(String sitesfile, int padding)338 public static ArrayList<CoverageArray> toCoverage(String sitesfile, int padding){ 339 ArrayList<CoverageArray> pcov=new ArrayList<CoverageArray>(8); 340 pcov.add(new CoverageArray2(0,1000)); 341 342 long perfect=0; 343 long semiperfect=0; 344 long sites=0; 345 346 String[] files=sitesfile.split(","); 347 for(String f : files){ 348 TextFile tf=new TextFile(f, false); 349 for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ 350 String[] split=line.split("\t"); 351 for(String s : split){ 352 SiteScore ss=SiteScore.fromText(s); 353 while(pcov.size()<=ss.chrom){ 354 pcov.add(new CoverageArray2(pcov.size(), 500)); 355 } 356 if(ss.perfect || ss.semiperfect){ 357 CoverageArray ca=pcov.get(ss.chrom); 358 for(int i=ss.start+padding; i<=ss.stop-padding; i++){ 359 ca.increment(i); 360 } 361 } 362 if(ss.perfect){perfect++;} 363 if(ss.semiperfect){semiperfect++;} 364 sites++; 365 assert(!ss.perfect || ss.semiperfect) : ss.perfect+", "+ss.semiperfect+"\n"+ss.header()+"\n"+ss.toText()+"\n"+s+"\n"; 366 } 367 } 368 tf.close(); 369 } 370 System.out.println("Read "+files.length+" sites file"+(files.length==1 ? "." : "s.")); 371 System.out.println("sites="+sites+" \tsemiperfect="+semiperfect+" \tperfect="+perfect); 372 return pcov; 373 } 374 375 376 public static long basesWritten=0; 377 public static long basesKept=0; 378 public static long basesDropped=0; 379 public static long basesX=0; 380 public static long contigsWritten=0; 381 public static long contigsKept=0; 382 public static long contigsDropped=0; 383 public static long contigsX=0; 384 385 public static int N_PAD_LENGTH=MergeFastaContigs.N_PAD_LENGTH; 386 public static int N_PAD_LENGTH2=MergeFastaContigs.N_PAD_LENGTH2; //for ends 387 public static int MIN_CONTIG_TO_ADD=50; 388 public static boolean BREAK_BAD_CONTIGS=false; 389 390 public static boolean verbose=false; 391 392 } 393