1 package prok; 2 3 import java.io.File; 4 5 import dna.AminoAcid; 6 import dna.Data; 7 import fileIO.FileFormat; 8 import fileIO.ReadWrite; 9 import shared.Parse; 10 import shared.Tools; 11 import stream.ConcurrentReadInputStream; 12 import stream.Read; 13 import stream.ReadInputStream; 14 import structures.ListNum; 15 import structures.LongHashSet; 16 17 /** Contains a lot of statics and static methods for gene-calling */ 18 public abstract class ProkObject { 19 parse(String arg, String a, String b)20 public static boolean parse(String arg, String a, String b){ 21 if(a.equalsIgnoreCase("16sstartslop") || a.equalsIgnoreCase("ssustartslop")){ 22 ssuStartSlop=Integer.parseInt(b); 23 }else if(a.equalsIgnoreCase("23sstartslop") || a.equalsIgnoreCase("lsustartslop")){ 24 lsuStartSlop=Integer.parseInt(b); 25 }else if(a.equalsIgnoreCase("5sstartslop")){ 26 r5SStartSlop=Integer.parseInt(b); 27 }else if(a.equalsIgnoreCase("16sstopslop") || a.equalsIgnoreCase("ssustopslop")){ 28 ssuStopSlop=Integer.parseInt(b); 29 }else if(a.equalsIgnoreCase("23sstopslop") || a.equalsIgnoreCase("lsustopslop")){ 30 lsuStopSlop=Integer.parseInt(b); 31 }else if(a.equalsIgnoreCase("5sstopslop")){ 32 r5SStopSlop=Integer.parseInt(b); 33 }else if(a.equals("plus")){ 34 PROCESS_PLUS_STRAND=Parse.parseBoolean(b); 35 }else if(a.equals("minus")){ 36 PROCESS_MINUS_STRAND=Parse.parseBoolean(b); 37 } 38 39 else if(a.equalsIgnoreCase("min16SIdentity") || a.equalsIgnoreCase("min16SId")) { 40 min16SIdentity=Float.parseFloat(b); 41 }else if(a.equalsIgnoreCase("min18SIdentity") || a.equalsIgnoreCase("min18SId")) { 42 min18SIdentity=Float.parseFloat(b); 43 }else if(a.equalsIgnoreCase("min23SIdentity") || a.equalsIgnoreCase("min23SId")) { 44 min23SIdentity=Float.parseFloat(b); 45 }else if(a.equalsIgnoreCase("min5SIdentity") || a.equalsIgnoreCase("min5SId")) { 46 min5SIdentity=Float.parseFloat(b); 47 } 48 49 else if(a.equalsIgnoreCase("align16s") || a.equalsIgnoreCase("load16SSequence")){ 50 load16SSequence=Parse.parseBoolean(b); 51 }else if(a.equalsIgnoreCase("align23s") || a.equalsIgnoreCase("load23SSequence")){ 52 load23SSequence=Parse.parseBoolean(b); 53 }else if(a.equalsIgnoreCase("align18s") || a.equalsIgnoreCase("load18SSequence")){ 54 load18SSequence=Parse.parseBoolean(b); 55 }else if(a.equalsIgnoreCase("align5s") || a.equalsIgnoreCase("load5SSequence")){ 56 load5SSequence=Parse.parseBoolean(b); 57 } 58 59 else if(a.equalsIgnoreCase("load16skmers") || a.equalsIgnoreCase("load18skmers") || a.equalsIgnoreCase("loadssukmers")){ 60 loadSSUkmers=Parse.parseBoolean(b); 61 }else if(a.equalsIgnoreCase("load23skmers") || a.equalsIgnoreCase("load28skmers") || a.equalsIgnoreCase("loadlsukmers")){ 62 loadLSUkmers=Parse.parseBoolean(b); 63 }else if(a.equalsIgnoreCase("load5skmers")){ 64 load5Skmers=Parse.parseBoolean(b); 65 }else if(a.equalsIgnoreCase("loadtrnakmers")){ 66 loadtRNAkmers=Parse.parseBoolean(b); 67 }else if(a.equalsIgnoreCase("klongtrna")){ 68 kLongTRna=Integer.parseInt(b); 69 }else if(a.equalsIgnoreCase("longkmers")){ 70 loadSSUkmers=loadLSUkmers=load5Skmers=loadtRNAkmers=Parse.parseBoolean(b); 71 }else if(a.equalsIgnoreCase("klong5s")){ 72 kLong5S=Integer.parseInt(b); 73 }else if(a.equalsIgnoreCase("klong16s") || a.equalsIgnoreCase("klong18s") || a.equalsIgnoreCase("klongssu")){ 74 kLongSSU=Integer.parseInt(b); 75 }else if(a.equalsIgnoreCase("klong23s") || a.equalsIgnoreCase("klong28s") || a.equalsIgnoreCase("klonglsu")){ 76 kLongLSU=Integer.parseInt(b); 77 }else if(a.equalsIgnoreCase("klongtrna")){ 78 kLongTRna=Integer.parseInt(b); 79 } 80 81 else{ 82 return false; 83 } 84 return true; 85 } 86 87 /*--------------------------------------------------------------*/ 88 processType(int type)89 public static boolean processType(int type){ 90 return (type==CDS ? callCDS : type==r16S ? call16S : type==r23S ? call23S : type==r18S ? call18S : type==r5S ? call5S : type==tRNA ? calltRNA : true); 91 } 92 startSlop(int type)93 public static int startSlop(int type) { 94 int slop=(type==r16S ? ssuStartSlop : type==r23S ? lsuStartSlop : type==r18S ? ssuStartSlop : type==r5S ? r5SStartSlop : 9999); 95 return slop; 96 } 97 stopSlop(int type)98 public static int stopSlop(int type) { 99 int slop=(type==r16S ? ssuStopSlop : type==r23S ? lsuStopSlop : type==r18S ? ssuStopSlop : type==r5S ? r5SStopSlop : 9999); 100 return slop; 101 } 102 minID(int type)103 public static float minID(int type) { 104 float minIdentity=(type==r16S ? min16SIdentity : type==r23S ? min23SIdentity : type==r18S ? min18SIdentity : type==r5S ? min5SIdentity : 0); 105 return minIdentity; 106 } 107 consensusReads(int type)108 public static Read[] consensusReads(int type) { 109 Read[] consensusReads=(type==r16S ? r16SSequence : type==r23S ? r23SSequence : type==r18S ? r18SSequence : type==r5S ? r5SSequence : null); 110 return consensusReads; 111 } 112 kmerSet(int type)113 public static LongHashSet kmerSet(int type) { 114 LongHashSet set=(type==tRNA ? trnaKmers : type==r16S ? ssuKmers : type==r23S ? lsuKmers : type==r5S ? r5SKmers : type==r18S ? ssuKmers : null); 115 return set; 116 } 117 kLongLen(int type)118 public static int kLongLen(int type) { 119 int kLongLen=(type==tRNA ? kLongTRna : type==r16S ? kLongSSU : type==r23S ? kLongLSU : type==r5S ? kLong5S : type==r18S ? kLongSSU : -1); 120 return kLongLen; 121 } 122 flagToType(int flag)123 public static int flagToType(int flag) { 124 return Integer.numberOfTrailingZeros(flag)+1; 125 } 126 typeToFlag(int type)127 public static byte typeToFlag(int type) { 128 assert(type<=6); 129 return (byte)(1<<(type-1)); 130 } 131 callType(int type)132 public static boolean callType(int type){//TODO: Turn these functions into array lookups 133 if(type==CDS){return callCDS;} 134 else if(type==tRNA){return calltRNA;} 135 else if(type==r16S){return call16S;} 136 else if(type==r23S){return call23S;} 137 else if(type==r5S){return call5S;} 138 else if(type==r18S){return call18S;} 139 assert(false) : type; 140 return false; 141 } 142 143 /*--------------------------------------------------------------*/ 144 /*---------------- Long Kmers ----------------*/ 145 /*--------------------------------------------------------------*/ 146 loadLongKmers()147 public static synchronized void loadLongKmers(){ 148 // assert(ssuKmers==null); 149 // assert(false) : load5Skmers+", "+kLong5s; 150 if(loadedLongKmers){return;} 151 if(loadSSUkmers){ssuKmers=loadLongKmersByType(kLongSSU, "ssu");} 152 if(loadLSUkmers){lsuKmers=loadLongKmersByType(kLongLSU, "lsu");} 153 if(load5Skmers){r5SKmers=loadLongKmersByType(kLong5S, "5S");} 154 if(loadtRNAkmers){trnaKmers=loadLongKmersByType(kLongTRna, "tRNA");} 155 loadedLongKmers=true; 156 } 157 158 // private static LongHashSet loadLongKmers(StatsContainer sc, int k, String prefix){ 159 // String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa"); 160 // if(!new File(fname).exists()){ 161 // fname=fname+".gz"; 162 // if(!new File(fname).exists()){ 163 // System.err.println("Can't find "+fname); 164 // return null; 165 // } 166 // } 167 // LongHashSet set=loadLongKmers(fname, k); 168 // sc.kmerSet=set; 169 // sc.kLongLen=k; 170 // return set; 171 // } 172 loadLongKmersByType(int k, String prefix)173 private static LongHashSet loadLongKmersByType(int k, String prefix){ 174 String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa", true); 175 if(!new File(fname).exists()){ 176 fname=fname+".gz"; 177 if(!new File(fname).exists()){ 178 System.err.println("Can't find "+fname); 179 return null; 180 } 181 } 182 LongHashSet set=loadLongKmers(fname, k); 183 return set; 184 } 185 loadLongKmers(String fname, int k)186 private static LongHashSet loadLongKmers(String fname, int k){//TODO: Consider making this a LongHashSet. No reason not to... 187 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false); 188 ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null); 189 cris.start(); //Start the stream 190 // if(verbose){outstream.println("Started cris");} 191 192 LongHashSet set=new LongHashSet(1000); 193 ListNum<Read> ln=cris.nextList(); 194 while(ln!=null && ln.size()>0){ 195 processList(ln, set, k); 196 cris.returnList(ln); 197 ln=cris.nextList(); 198 } 199 if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());} 200 ReadWrite.closeStream(cris); 201 return set; 202 } 203 processList(ListNum<Read> ln, LongHashSet set, int k)204 private static LongHashSet processList(ListNum<Read> ln, LongHashSet set, int k){ 205 final long mask=~((-1L)<<(2*k)); 206 for(Read r : ln){ 207 final byte[] bases=r.bases; 208 long kmer=0; 209 int len=0; 210 for(byte b : bases){ 211 final int num=AminoAcid.baseToNumber[b]; 212 if(num>=0){ 213 len++; 214 kmer=((kmer<<2)|num)&mask; 215 if(len>=k){ 216 set.add(kmer); 217 } 218 }else{ 219 len=0; 220 } 221 } 222 } 223 return set; 224 } 225 226 /*--------------------------------------------------------------*/ 227 /*---------------- Consensus Sequence ----------------*/ 228 /*--------------------------------------------------------------*/ 229 loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro)230 public static synchronized void loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro){ 231 if(loadedConsensusSequence){return;} 232 // assert(r16SSequence==null); 233 if(load16SSequence){r16SSequence=loadConsensusSequenceType("16S", removeMito, removeChloro);} 234 if(load18SSequence){r18SSequence=loadConsensusSequenceType("18S", removeMito, removeChloro);} 235 if(load23SSequence){r23SSequence=loadConsensusSequenceType("23S", removeMito, removeChloro);} 236 if(load5SSequence){r5SSequence=loadConsensusSequenceType("5S", removeMito, removeChloro);} 237 if(loadtRNASequence){trnaSequence=loadConsensusSequenceType("tRNA", removeMito, removeChloro);} 238 loadedConsensusSequence=true; 239 } 240 loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro)241 public static Read[] loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro){ 242 String fname=null; 243 fname=Data.findPath("?"+prefix+"_consensus_sequence.fq", false); 244 if(fname!=null && (fname.endsWith(".jar") || new File(fname).exists())){ 245 fname=Tools.fixExtension(fname); 246 }else{ 247 fname=Data.findPath("?"+prefix+"_consensus_sequence.fa", true); 248 fname=Tools.fixExtension(fname); 249 if(!fname.endsWith(".jar") && !new File(fname).exists()){ 250 System.err.println("Can't find "+fname); 251 return null; 252 } 253 } 254 Read[] array=loadConsensusSequence(fname); 255 if(removeMito){array=stripOrganelle(array, "mito");} 256 if(removeChloro){array=stripOrganelle(array, "plastid");} 257 return array; 258 } 259 loadConsensusSequence(String fname)260 private static Read[] loadConsensusSequence(String fname){ 261 FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false); 262 Read[] array=ReadInputStream.toReadArray(ff, -1); 263 return array; 264 } 265 stripOrganelle(Read[] array, String key)266 private static Read[] stripOrganelle(Read[] array, String key){ 267 int removed=0; 268 for(int j=0; j<array.length; j++){ 269 if(array[j].id.toLowerCase().startsWith(key)) { 270 array[j]=null; 271 removed++; 272 } 273 } 274 if(removed>0){array=Tools.condenseStrict(array);} 275 return array; 276 } 277 278 /*--------------------------------------------------------------*/ 279 280 public static final int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7; 281 public static String[] typeStrings=new String[] {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"}; 282 public static String[] typeStrings2=new String[] {"CDS", "tRNA", "rRNA", "rRNA", "rRNA", "rRNA", "rRNA", "RNA"}; 283 public static String[] specialTypeStrings=new String[] {null, "tRNA", "16S", "23S", "5S", "18S", "28S", null}; isSpecialType(String type)284 public static boolean isSpecialType(String type){ 285 if(type==null){return false;} 286 for(String s : specialTypeStrings){ 287 if(type.equalsIgnoreCase(s)){return true;} 288 } 289 return false; 290 } 291 292 public static int kInnerRNA=6; 293 public static int kStartRNA=3; 294 public static int kStopRNA=3; 295 296 public static int kLongSSU=15; 297 public static int kLongLSU=15; 298 public static int kLong5S=15; 299 public static int kLongTRna=15; 300 301 public static float min16SIdentity=0.62f; 302 public static float min23SIdentity=0.60f; 303 public static float min5SIdentity=0.60f; 304 public static float min18SIdentity=0.60f; 305 306 static int ssuStartSlop=200; 307 static int ssuStopSlop=0; 308 static int lsuStartSlop=220; 309 static int lsuStopSlop=0; 310 static int r5SStartSlop=50; 311 static int r5SStopSlop=50; 312 313 public static boolean callCDS=true; 314 public static boolean calltRNA=true; 315 public static boolean call16S=true; 316 public static boolean call23S=true; 317 public static boolean call5S=true; 318 public static boolean call18S=false; 319 320 public static LongHashSet ssuKmers=null; 321 public static LongHashSet lsuKmers=null; 322 public static LongHashSet r5SKmers=null; 323 public static LongHashSet trnaKmers=null; 324 325 public static Read[] trnaSequence=null; 326 public static Read[] r16SSequence=null; 327 public static Read[] r23SSequence=null; 328 public static Read[] r5SSequence=null; 329 public static Read[] r18SSequence=null; 330 331 public static boolean PROCESS_PLUS_STRAND=true; 332 public static boolean PROCESS_MINUS_STRAND=true; 333 334 public static boolean loadSSUkmers=true; 335 public static boolean loadLSUkmers=true; 336 public static boolean load5Skmers=true; 337 public static boolean loadtRNAkmers=true; 338 private static boolean loadedLongKmers=false; 339 340 public static boolean loadtRNASequence=false; 341 public static boolean load16SSequence=true; 342 public static boolean load23SSequence=true; 343 public static boolean load5SSequence=true; 344 public static boolean load18SSequence=true; 345 private static boolean loadedConsensusSequence=false; 346 347 } 348