1 package tax; 2 3 import java.io.File; 4 import java.util.ArrayList; 5 6 import fileIO.ByteFile; 7 import fileIO.ReadWrite; 8 import shared.Parse; 9 import shared.Shared; 10 import shared.Tools; 11 import structures.IntList; 12 13 /** 14 * @author Brian Bushnell 15 * @date Mar 10, 2015 16 * 17 */ 18 public class GiToTaxid { 19 main(String[] args)20 public static void main(String[] args){ 21 ReadWrite.USE_UNPIGZ=true; 22 ReadWrite.USE_PIGZ=true; 23 ReadWrite.ZIPLEVEL=9; 24 ReadWrite.PIGZ_BLOCKSIZE=256; 25 // ReadWrite.PIGZ_ITERATIONS=30; 26 27 for(String arg : args){ 28 String[] split=arg.split("="); 29 String a=split[0].toLowerCase(); 30 String b=split.length>1 ? split[1] : null; 31 shared.Parser.parseZip(arg, a, b); 32 } 33 // if(args.length>2 && false){//Run a test 34 // test(args); 35 // }else 36 if(args.length>=2){//Write array 37 initialize(args[0]); 38 ReadWrite.write(array, args[1], true); 39 } 40 } 41 test(String[] args)42 public static void test(String[] args){ 43 System.err.println(getID(1000)); 44 System.err.println(getID(10000)); 45 System.err.println(getID(10001)); 46 System.err.println(getID(10002)); 47 System.err.println(getID(10003)); 48 System.err.println(getID(10004)); 49 System.err.println(getID(10005)); 50 System.err.println(getID(100000)); 51 System.err.println(getID(1000000)); 52 System.err.println(getID(10000000)); 53 54 TaxTree tree=null; 55 if(args.length>1){ 56 tree=TaxTree.loadTaxTree(args[0], System.err, true, true); 57 } 58 59 System.err.println("Strings:"); 60 int x; 61 x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44"); 62 System.err.println(x); 63 if(tree!=null){ 64 System.err.println(tree.getNode(x)); 65 tree.incrementRaw(x, 30); 66 } 67 x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds"); 68 System.err.println(x); 69 if(tree!=null){ 70 System.err.println(tree.getNode(x)); 71 tree.incrementRaw(x, 40); 72 } 73 x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence"); 74 System.err.println(x); 75 if(tree!=null){ 76 System.err.println(tree.getNode(x)); 77 tree.incrementRaw(x, 20); 78 } 79 80 if(tree!=null){ 81 tree.percolateUp(); 82 ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35); 83 for(TaxNode n : nodes){ 84 System.err.println(n); 85 } 86 } 87 } 88 parseGiToTaxid(String s)89 public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');} parseGiToTaxid(String s, char delimiter)90 public static int parseGiToTaxid(String s, char delimiter){ 91 int x=parseGiNumber(s, delimiter); 92 assert(x>=0) : s; 93 assert(array!=null) : "To use gi numbers, you must load a gi table."; 94 // if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty. 95 if(x>=0 && x<array.length){return array[x];} 96 assert(x<array.length) : "The GI number "+x+" is too big.\n" 97 + "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n" 98 + "To ignore this problem, please run with the -da flag.\n"; 99 return -1; 100 } 101 102 103 public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');} 104 public static int parseGiToTaxid(byte[] s, char delimiter){ 105 int x=parseGiNumber(s, delimiter); 106 if(x>=0){return array[x];} 107 return -1; 108 } 109 110 /** Parse a gi number, or return -1 if formatted incorrectly. */ parseGiNumber(String s, char delimiter)111 static int parseGiNumber(String s, char delimiter){ 112 if(s==null || s.length()<4){return -1;} 113 // System.err.println("a"); 114 if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);} 115 // System.err.println("b"); 116 if(!s.startsWith("gi")){return -1;} 117 // System.err.println("c"); 118 // System.err.println("d"); 119 int initial=s.indexOf(delimiter); 120 // System.err.println("e"); 121 if(initial<0){ 122 if(delimiter!='~'){ 123 delimiter='~'; 124 initial=s.indexOf(delimiter); 125 } 126 if(initial<0){ 127 delimiter='_'; 128 initial=s.indexOf(delimiter); 129 } 130 if(initial<0){return -1;} 131 // System.err.println("f"); 132 // System.err.println("g"); 133 } 134 // System.err.println("h"); 135 if(!Tools.isDigit(s.charAt(initial+1))){return -1;} 136 // System.err.println("i"); 137 138 int number=0; 139 for(int i=initial+1; i<s.length(); i++){ 140 char c=s.charAt(i); 141 if(c==delimiter){break;} 142 assert(Tools.isDigit(c)); 143 number=(number*10)+(c-'0'); 144 } 145 // System.err.println("j: "+number); 146 return number; 147 } 148 149 /** Parse a ncbi number, or return -1 if formatted incorrectly. */ parseTaxidNumber(String s, char delimiter)150 public static int parseTaxidNumber(String s, char delimiter){ 151 if(s==null || s.length()<5){return -1;} 152 if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);} 153 if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;} 154 int initial=s.indexOf(delimiter); 155 if(initial<0){ 156 delimiter='_'; 157 initial=s.indexOf(delimiter); 158 if(initial<0){return -1;} 159 } 160 if(!Tools.isDigit(s.charAt(initial+1))){return -1;} 161 162 int number=0; 163 for(int i=initial+1; i<s.length(); i++){ 164 char c=s.charAt(i); 165 if(c==delimiter || c==' '){break;} 166 assert(Tools.isDigit(c)) : c+"\n"+s; 167 number=(number*10)+(c-'0'); 168 } 169 return number; 170 } 171 172 getID(String s)173 public static int getID(String s){return getID(s, '|');} 174 /** Get the taxID from a header starting with a taxID or gi number */ getID(String s, char delimiter)175 public static int getID(String s, char delimiter){ 176 int x=parseTaxidNumber(s, delimiter); 177 if(x>=0){return x;} 178 x=parseGiNumber(s, delimiter); 179 if(x>=0){return array[x];} 180 return -1; 181 } 182 183 /** Parse a gi number, or return -1 if formatted incorrectly. */ parseGiNumber(byte[] s, char delimiter)184 static int parseGiNumber(byte[] s, char delimiter){ 185 if(s==null || s.length<4){return -1;} 186 if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;} 187 int initial=Tools.indexOf(s, (byte)delimiter); 188 if(initial<0){ 189 delimiter='_'; 190 initial=Tools.indexOf(s, (byte)delimiter); 191 if(initial<0){return -1;} 192 } 193 if(!Tools.isDigit(s[initial+1])){return -1;} 194 195 int number=0; 196 for(int i=initial+1; i<s.length; i++){ 197 byte c=s[i]; 198 if(c==delimiter){break;} 199 assert(Tools.isDigit(c)); 200 number=(number*10)+(c-'0'); 201 } 202 return number; 203 } 204 205 /** Parse a gi number, or return -1 if formatted incorrectly. */ parseNcbiNumber(byte[] s, char delimiter)206 static int parseNcbiNumber(byte[] s, char delimiter){ 207 if(s==null || s.length<3){return -1;} 208 if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;} 209 int initial=Tools.indexOf(s, (byte)delimiter); 210 if(initial<0){ 211 delimiter='_'; 212 initial=Tools.indexOf(s, (byte)delimiter); 213 if(initial<0){return -1;} 214 } 215 if(!Tools.isDigit(s[initial+1])){return -1;} 216 217 int number=0; 218 for(int i=initial+1; i<s.length; i++){ 219 byte c=s[i]; 220 if(c==delimiter){break;} 221 assert(Tools.isDigit(c)); 222 number=(number*10)+(c-'0'); 223 } 224 return number; 225 } 226 getID(byte[] s)227 public static int getID(byte[] s){return getID(s, '|');} 228 /** Get the taxID from a header starting with a taxID or gi number */ getID(byte[] s, char delimiter)229 public static int getID(byte[] s, char delimiter){ 230 int x=parseGiNumber(s, delimiter); 231 if(x>=0){return array[x];} 232 return parseNcbiNumber(s, delimiter); 233 } 234 235 /** Get the taxID from a gi number */ getID(int gi)236 public static int getID(int gi){ 237 assert(gi>=0) : gi; 238 assert(gi<array.length) : gi+", "+array.length; 239 return array[gi]; 240 } 241 242 public static void initialize(String fname){ 243 assert(fname!=null); 244 if(fileString==null || !fileString.equals(fname)){ 245 synchronized(GiToTaxid.class){ 246 if(!initialized || fileString==null || !fileString.equals(fname)){ 247 fileString=fname; 248 if(fname.contains(".int1d")){ 249 array=ReadWrite.read(int[].class, fname, true); 250 }else{ 251 array=makeArray(fname); 252 } 253 } 254 initialized=true; 255 } 256 } 257 } 258 259 public static boolean isInitialized(){return initialized;} 260 261 public static synchronized void unload(){ 262 array=null; 263 fileString=null; 264 initialized=false; 265 } 266 267 private static int[] makeArray(String fnames){ 268 String[] split; 269 if(new File(fnames).exists()){split=new String[] {fnames};} 270 else if(fnames.indexOf(',')>=0){split=fnames.split(",");} 271 else if(fnames.indexOf('#')>=0){ 272 assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory."; 273 File dir=new File(System.getProperty("user.dir")); 274 String prefix=fnames.substring(0, fnames.indexOf('#')); 275 String suffix=fnames.substring(fnames.indexOf('#')+1); 276 277 File[] array=dir.listFiles(); 278 StringBuilder sb=new StringBuilder(); 279 String comma=""; 280 for(File f : array){ 281 String s=f.getName(); 282 if(s.startsWith(prefix) && s.startsWith(suffix)){ 283 sb.append(comma); 284 sb.append(s); 285 comma=","; 286 } 287 } 288 split=sb.toString().split(","); 289 }else{ 290 throw new RuntimeException("Invalid file: "+fnames); 291 } 292 293 IntList list=new IntList(); 294 // assert(max<Integer.MAX_VALUE) : "Overflow."; 295 // int[] x=new int[(int)max+1]; 296 // Arrays.fill(x, -1); 297 298 long total=0; 299 for(String s : split){ 300 long count=addToList(s, list); 301 total+=count; 302 } 303 return list.shrink().array; 304 } 305 306 private static long addToList(String fname, IntList list){ 307 boolean warned=false; 308 ByteFile bf=ByteFile.makeByteFile(fname, true); 309 long count=0, invalid=0; 310 byte[] line=bf.nextLine(); 311 while(line!=null){ Tools.isDigit(line[line.length-1])312 if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na 313 count++; 314 int tab2=Tools.indexOfNth(line, '\t', 2); 315 int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1); 316 assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length; 317 assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'"; 318 assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'"; 319 // assert(false) : tab2+", "+tab3+", '"+new String(line)+"'"; 320 int tid=Parse.parseInt(line, tab2+1, tab3); 321 int gi=Parse.parseInt(line, tab3+1, line.length); 322 if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021. 323 invalid++; 324 }else{ 325 assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'"; 326 int old=list.get(gi); 327 assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3; 328 329 list.set(gi, tid); 330 331 //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi; 332 // if(x[gi]!=-1 && x[gi]!=ncbi){ 333 // if(!warned){ 334 // System.err.println("***WARNING*** For file "+fname+":\n"+ 335 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ 336 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" 337 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" 338 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); 339 // warned=true; 340 // } 341 // }else{ 342 // x[gi]=ncbi; 343 // } 344 } 345 }else{ 346 if(line.length==0){System.err.println(fname+", "+count);}//debug 347 invalid++; 348 } 349 line=bf.nextLine(); 350 } 351 if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);} 352 bf.close(); 353 return count; 354 } 355 356 // private static int[] makeArrayOld(String fnames){ 357 // String[] split; 358 // if(new File(fnames).exists()){split=new String[] {fnames};} 359 // else{split=fnames.split(",");} 360 // 361 // long max=0; 362 // for(String s : split){ 363 // max=Tools.max(max, findMaxID(s)); 364 // } 365 // 366 // assert(max<Integer.MAX_VALUE) : "Overflow."; 367 // int[] x=new int[(int)max+1]; 368 // Arrays.fill(x, -1); 369 // 370 // long total=0; 371 // for(String s : split){ 372 // long count=fillArray(s, x); 373 // total+=count; 374 // } 375 // return x; 376 // } 377 // 378 // private static long findMaxID(String fname){ 379 // ByteFile bf=ByteFile.makeByteFile(fname, true); 380 // long count=0, max=0; 381 // byte[] line=bf.nextLine(); 382 // while(line!=null){ 383 // count++; 384 // int tab=Tools.indexOf(line, (byte)'\t'); 385 // long gi=Parse.parseLong(line, 0, tab); 386 // max=Tools.max(max, gi); 387 // line=bf.nextLine(); 388 // } 389 // bf.close(); 390 // return max; 391 // } 392 // 393 // private static long fillArray(String fname, int[] x){ 394 // boolean warned=false; 395 // ByteFile bf=ByteFile.makeByteFile(fname, true); 396 // long count=0; 397 // byte[] line=bf.nextLine(); 398 // while(line!=null){ 399 // count++; 400 // int tab=Tools.indexOf(line, (byte)'\t'); 401 // int gi=Parse.parseInt(line, 0, tab); 402 // int ncbi=Parse.parseInt(line, tab+1, line.length); 403 // //assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi; 404 // if(x[gi]!=-1 && x[gi]!=ncbi){ 405 // if(!warned){ 406 // System.err.println("***WARNING*** For file "+fname+":\n"+ 407 // ("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+ 408 // "\nThis may be an error from NCBI and you may wish to report it, but it is\n" 409 // + "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n" 410 // + "at least between nucleotide and protein, and gi numbers are deprecated anyway."); 411 // warned=true; 412 // } 413 // }else{ 414 // x[gi]=ncbi; 415 // } 416 // line=bf.nextLine(); 417 // } 418 // if(verbose){System.err.println("Count: "+count);} 419 // bf.close(); 420 // return count; 421 // } 422 423 private static int[] array; 424 private static String fileString; 425 426 public static boolean verbose=false; 427 private static boolean initialized=false; 428 } 429