1 package tax; 2 3 import java.io.File; 4 import java.io.PrintStream; 5 import java.util.ArrayList; 6 import java.util.Arrays; 7 import java.util.Collections; 8 import java.util.HashMap; 9 import java.util.Locale; 10 11 import fileIO.ByteFile; 12 import fileIO.ByteFile1; 13 import fileIO.ByteFile2; 14 import fileIO.ByteStreamWriter; 15 import fileIO.FileFormat; 16 import fileIO.ReadWrite; 17 import fileIO.TextFile; 18 import shared.Parse; 19 import shared.Parser; 20 import shared.PreParser; 21 import shared.Shared; 22 import shared.Timer; 23 import shared.Tools; 24 import stream.ConcurrentGenericReadInputStream; 25 import stream.FastaReadInputStream; 26 import structures.ByteBuilder; 27 import structures.StringNum; 28 29 /** 30 * Counts patterns in Accessions. 31 * Handles hashing for Accession to TaxID lookups. 32 * @author Brian Bushnell 33 * @date May 9, 2018 34 * 35 */ 36 public class AnalyzeAccession_ST { 37 main(String[] args)38 public static void main(String[] args){ 39 //Start a timer immediately upon code entrance. 40 Timer t=new Timer(); 41 42 //Create an instance of this class 43 AnalyzeAccession_ST x=new AnalyzeAccession_ST(args); 44 45 //Run the object 46 x.process(t); 47 48 //Close the print stream if it was redirected 49 Shared.closeStream(x.outstream); 50 } 51 AnalyzeAccession_ST(String[] args)52 public AnalyzeAccession_ST(String[] args){ 53 54 {//Preparse block for help, config files, and outstream 55 PreParser pp=new PreParser(args, getClass(), false); 56 args=pp.args; 57 outstream=pp.outstream; 58 } 59 60 ReadWrite.USE_PIGZ=ReadWrite.USE_UNPIGZ=true; 61 ReadWrite.MAX_ZIP_THREADS=Shared.threads(); 62 63 Parser parser=new Parser(); 64 for(int i=0; i<args.length; i++){ 65 String arg=args[i]; 66 String[] split=arg.split("="); 67 String a=split[0].toLowerCase(); 68 String b=split.length>1 ? split[1] : null; 69 70 if(a.equals("lines")){ 71 maxLines=Long.parseLong(b); 72 if(maxLines<0){maxLines=Long.MAX_VALUE;} 73 }else if(a.equals("verbose")){ 74 verbose=Parse.parseBoolean(b); 75 ByteFile1.verbose=verbose; 76 ByteFile2.verbose=verbose; 77 stream.FastaReadInputStream.verbose=verbose; 78 ConcurrentGenericReadInputStream.verbose=verbose; 79 stream.FastqReadInputStream.verbose=verbose; 80 ReadWrite.verbose=verbose; 81 }else if(a.equals("in")){ 82 if(b==null){in.clear();} 83 else{ 84 String[] split2=b.split(","); 85 for(String s2 : split2){ 86 in.add(s2); 87 } 88 } 89 }else if(b==null && new File(arg).exists()){ 90 in.add(arg); 91 }else if(parser.parse(arg, a, b)){ 92 //do nothing 93 }else{ 94 outstream.println("Unknown parameter "+args[i]); 95 assert(false) : "Unknown parameter "+args[i]; 96 // throw new RuntimeException("Unknown parameter "+args[i]); 97 } 98 } 99 100 {//Process parser fields 101 overwrite=parser.overwrite; 102 append=parser.append; 103 104 out=parser.out1; 105 } 106 107 assert(FastaReadInputStream.settingsOK()); 108 109 if(in==null){throw new RuntimeException("Error - at least one input file is required.");} 110 111 if(!ByteFile.FORCE_MODE_BF2){ 112 ByteFile.FORCE_MODE_BF2=false; 113 ByteFile.FORCE_MODE_BF1=true; 114 } 115 116 if(out!=null && out.equalsIgnoreCase("null")){out=null;} 117 118 if(!Tools.testOutputFiles(overwrite, append, false, out)){ 119 outstream.println((out==null)+", "+out); 120 throw new RuntimeException("\n\noverwrite="+overwrite+"; Can't write to output files "+out+"\n"); 121 } 122 123 ffout=FileFormat.testOutput(out, FileFormat.TXT, null, true, overwrite, append, false); 124 ffina=new FileFormat[in.size()]; 125 for(int i=0; i<in.size(); i++){ 126 ffina[i]=FileFormat.testInput(in.get(i), FileFormat.TXT, null, true, false); 127 } 128 } 129 process(Timer t)130 void process(Timer t){ 131 132 for(FileFormat ffin : ffina){ 133 process_inner(ffin); 134 } 135 136 if(ffout!=null){ 137 ByteStreamWriter bsw=new ByteStreamWriter(ffout); 138 bsw.println("#Pattern\tCount\tCombos\tBits"); 139 ArrayList<StringNum> list=new ArrayList<StringNum>(); 140 list.addAll(countMap.values()); 141 Collections.sort(list); 142 Collections.reverse(list); 143 for(StringNum sn : list){ 144 double combos=1; 145 for(int i=0; i<sn.s.length(); i++){ 146 char c=sn.s.charAt(i); 147 if(c=='D'){combos*=10;} 148 else if(c=='L'){combos*=26;} 149 } 150 bsw.print(sn.toString().getBytes()); 151 bsw.println("\t"+(long)combos+"\t"+String.format(Locale.ROOT, "%.2f", Tools.log2(combos))); 152 } 153 bsw.start(); 154 errorState|=bsw.poisonAndWait(); 155 } 156 157 t.stop(); 158 159 outstream.println(Tools.timeLinesBytesProcessed(t, linesProcessed, bytesProcessed, 8)); 160 161 outstream.println(); 162 outstream.println("Valid Lines: \t"+linesOut); 163 outstream.println("Invalid Lines: \t"+(linesProcessed-linesOut)); 164 165 if(errorState){ 166 throw new RuntimeException(getClass().getName()+" terminated in an error state; the output may be corrupt."); 167 } 168 } 169 process_inner(FileFormat ffin)170 void process_inner(FileFormat ffin){ 171 172 ByteFile bf=ByteFile.makeByteFile(ffin); 173 174 byte[] line=bf.nextLine(); 175 StringBuilder buffer=new StringBuilder(32); 176 177 for(int lineNum=0; line!=null; lineNum++){ 178 if(line.length>0){ 179 if(maxLines>0 && linesProcessed>=maxLines){break;} 180 linesProcessed++; 181 bytesProcessed+=(line.length+1); 182 183 assert((lineNum==0)==(Tools.startsWith(line, "accession"))) : "Line "+lineNum+": "+new String(line); 184 // final boolean valid=(line[0]!='#'); 185 186 if(true){ 187 linesOut++; 188 bytesOut+=(line.length+1); 189 increment(line, buffer); 190 } 191 } 192 line=bf.nextLine(); 193 } 194 195 errorState|=bf.close(); 196 } 197 increment(byte[] line, StringBuilder buffer)198 void increment(byte[] line, StringBuilder buffer){ 199 buffer.setLength(0); 200 for(int i=0; i<line.length; i++){ 201 final byte b=line[i]; 202 if(b==' ' || b=='\t' || b=='.'){break;} 203 buffer.append((char)remap[b]); 204 } 205 String key=buffer.toString(); 206 StringNum value=countMap.get(key); 207 if(value!=null){value.increment();} 208 else{countMap.put(key, new StringNum(key, 1));} 209 } 210 combos(String s)211 public static long combos(String s){ 212 double combos=1; 213 for(int i=0; i<s.length(); i++){ 214 char c=s.charAt(i); 215 if(c=='D'){combos*=10;} 216 else if(c=='L'){combos*=26;} 217 } 218 return (combos>=Long.MAX_VALUE ? Long.MAX_VALUE : (long)Math.ceil(combos)); 219 } 220 combos(byte[] s)221 public static long combos(byte[] s){ 222 double combos=1; 223 for(int i=0; i<s.length; i++){ 224 byte c=s[i]; 225 if(c=='D'){combos*=10;} 226 else if(c=='L'){combos*=26;} 227 } 228 return (combos>=Long.MAX_VALUE ? -1 : (long)Math.ceil(combos)); 229 } 230 231 /*--------------------------------------------------------------*/ 232 loadCodeMap(String fname)233 public static HashMap<String, Integer> loadCodeMap(String fname){ 234 assert(codeMap==null); 235 TextFile tf=new TextFile(fname); 236 ArrayList<String> list=new ArrayList<String>(); 237 for(String line=tf.nextLine(); line!=null; line=tf.nextLine()){ 238 if(!line.startsWith("#")){ 239 String[] split=line.split("\t"); 240 list.add(split[0]); 241 } 242 } 243 HashMap<String, Integer> map=new HashMap<String, Integer>(list.size()*3); 244 codeBits=(int)Math.ceil(Tools.log2(list.size())); 245 final int patternBits=63-codeBits; 246 final long maxCombos=((1L<<(patternBits-1))-1); 247 for(int i=0; i<list.size(); i++){ 248 String s=list.get(i); 249 longestPattern=Tools.max(longestPattern, s.length()); 250 long combos=combos(s); 251 if(combos<0 || combos>=maxCombos){map.put(s, -1);} 252 else{map.put(s, i);} 253 } 254 codeMap=map; 255 return map; 256 } 257 digitize(String s)258 public static long digitize(String s){ 259 String pattern=remap(s); 260 Integer code=codeMap.get(pattern); 261 if(code==null){return -2;} 262 if(code.intValue()<0){return -1;} 263 264 long number=0; 265 for(int i=0; i<pattern.length(); i++){ 266 char c=s.charAt(i); 267 char p=pattern.charAt(i); 268 if(p=='-'){ 269 //do nothing 270 }else if(p=='D'){ 271 number=(number*10)+(c-'0'); 272 }else if(p=='L'){ 273 number=(number*26)+(Tools.toUpperCase(c)-'A'); 274 }else{ 275 assert(false) : s; 276 } 277 } 278 number=(number<<codeBits)+code; 279 return number; 280 } 281 digitize(byte[] s)282 public static long digitize(byte[] s){ 283 String pattern=remap(s); 284 Integer code=codeMap.get(pattern); 285 if(code==null){return -2;} 286 if(code.intValue()<0){return -1;} 287 288 long number=0; 289 for(int i=0; i<pattern.length(); i++){ 290 byte c=s[i]; 291 char p=pattern.charAt(i); 292 if(p=='-'){ 293 //do nothing 294 }else if(p=='D'){ 295 number=(number*10)+(c-'0'); 296 }else if(p=='L'){ 297 number=(number*26)+(Tools.toUpperCase(c)-'A'); 298 }else{ 299 assert(false) : s; 300 } 301 } 302 number=(number<<codeBits)+code; 303 return number; 304 } 305 remap(String s)306 public static String remap(String s){ 307 ByteBuilder buffer=new ByteBuilder(s.length()); 308 for(int i=0; i<s.length(); i++){ 309 final char b=s.charAt(i); 310 if(b==' ' || b=='\t' || b=='.'){break;} 311 buffer.append((char)remap[b]); 312 } 313 return buffer.toString(); 314 } 315 remap(byte[] s)316 public static String remap(byte[] s){ 317 ByteBuilder buffer=new ByteBuilder(s.length); 318 for(int i=0; i<s.length; i++){ 319 final byte b=s[i]; 320 if(b==' ' || b=='\t' || b=='.'){break;} 321 buffer.append((char)remap[b]); 322 } 323 return buffer.toString(); 324 } 325 326 /*--------------------------------------------------------------*/ 327 328 private ArrayList<String> in=new ArrayList<String>(); 329 private String out=null; 330 331 /*--------------------------------------------------------------*/ 332 333 private HashMap<String, StringNum> countMap=new HashMap<String, StringNum>(); 334 public static HashMap<String, Integer> codeMap; 335 private static int codeBits=-1; 336 private static int longestPattern=-1; 337 338 private long linesProcessed=0; 339 private long linesOut=0; 340 private long bytesProcessed=0; 341 private long bytesOut=0; 342 343 private long maxLines=Long.MAX_VALUE; 344 345 /*--------------------------------------------------------------*/ 346 347 private final FileFormat[] ffina; 348 private final FileFormat ffout; 349 350 private static final byte[] remap=makeRemap(); 351 makeRemap()352 private static byte[] makeRemap(){ 353 byte[] array=new byte[128]; 354 Arrays.fill(array, (byte)'?'); 355 for(int i='A'; i<='Z'; i++){array[i]='L';} 356 for(int i='a'; i<='z'; i++){array[i]='L';} 357 for(int i='0'; i<='9'; i++){array[i]='D';} 358 array['_']=array['-']='-'; 359 return array; 360 } 361 362 /*--------------------------------------------------------------*/ 363 364 private PrintStream outstream=System.err; 365 public static boolean verbose=false; 366 public boolean errorState=false; 367 private boolean overwrite=false; 368 private boolean append=false; 369 370 } 371