1 package tax;
2 
3 import java.io.File;
4 import java.util.ArrayList;
5 
6 import fileIO.ByteFile;
7 import fileIO.ReadWrite;
8 import shared.Parse;
9 import shared.Shared;
10 import shared.Tools;
11 import structures.IntList;
12 
13 /**
14  * @author Brian Bushnell
15  * @date Mar 10, 2015
16  *
17  */
18 public class GiToTaxid {
19 
main(String[] args)20 	public static void main(String[] args){
21 		ReadWrite.USE_UNPIGZ=true;
22 		ReadWrite.USE_PIGZ=true;
23 		ReadWrite.ZIPLEVEL=9;
24 		ReadWrite.PIGZ_BLOCKSIZE=256;
25 //		ReadWrite.PIGZ_ITERATIONS=30;
26 
27 		for(String arg : args){
28 			String[] split=arg.split("=");
29 			String a=split[0].toLowerCase();
30 			String b=split.length>1 ? split[1] : null;
31 			shared.Parser.parseZip(arg, a, b);
32 		}
33 //		if(args.length>2 && false){//Run a test
34 //			test(args);
35 //		}else
36 		if(args.length>=2){//Write array
37 			initialize(args[0]);
38 			ReadWrite.write(array, args[1], true);
39 		}
40 	}
41 
test(String[] args)42 	public static void test(String[] args){
43 		System.err.println(getID(1000));
44 		System.err.println(getID(10000));
45 		System.err.println(getID(10001));
46 		System.err.println(getID(10002));
47 		System.err.println(getID(10003));
48 		System.err.println(getID(10004));
49 		System.err.println(getID(10005));
50 		System.err.println(getID(100000));
51 		System.err.println(getID(1000000));
52 		System.err.println(getID(10000000));
53 
54 		TaxTree tree=null;
55 		if(args.length>1){
56 			tree=TaxTree.loadTaxTree(args[0], System.err, true, true);
57 		}
58 
59 		System.err.println("Strings:");
60 		int x;
61 		x=getID("gi|18104025|emb|AJ427095.1| Ceratitis capitata centromeric or pericentromeric satellite DNA, clone 44");
62 		System.err.println(x);
63 		if(tree!=null){
64 			System.err.println(tree.getNode(x));
65 			tree.incrementRaw(x, 30);
66 		}
67 		x=getID("gi|15982920|gb|AY057568.1| Arabidopsis thaliana AT5g43500/MWF20_22 mRNA, complete cds");
68 		System.err.println(x);
69 		if(tree!=null){
70 			System.err.println(tree.getNode(x));
71 			tree.incrementRaw(x, 40);
72 		}
73 		x=getID("gi|481043749|gb|KC494054.1| Plesiochorus cymbiformis isolate ST05-58 internal transcribed spacer 2, partial sequence");
74 		System.err.println(x);
75 		if(tree!=null){
76 			System.err.println(tree.getNode(x));
77 			tree.incrementRaw(x, 20);
78 		}
79 
80 		if(tree!=null){
81 			tree.percolateUp();
82 			ArrayList<TaxNode> nodes=tree.gatherNodesAtLeastLimit(35);
83 			for(TaxNode n : nodes){
84 				System.err.println(n);
85 			}
86 		}
87 	}
88 
parseGiToTaxid(String s)89 	public static int parseGiToTaxid(String s){return parseGiToTaxid(s, '|');}
parseGiToTaxid(String s, char delimiter)90 	public static int parseGiToTaxid(String s, char delimiter){
91 		int x=parseGiNumber(s, delimiter);
92 		assert(x>=0) : s;
93 		assert(array!=null) : "To use gi numbers, you must load a gi table.";
94 //		if(x>=array.length || array[x]<0){x=(int)(Math.random()*array.length);} //Test to make sure array is nonempty.
95 		if(x>=0 && x<array.length){return array[x];}
96 		assert(x<array.length) : "The GI number "+x+" is too big.\n"
97 				+ "Please update the gi table with the latest version from NCBI as per the instructions in gitable.sh.\n"
98 				+ "To ignore this problem, please run with the -da flag.\n";
99 		return -1;
100 	}
101 
102 
103 	public static int parseGiToTaxid(byte[] s){return parseGiToTaxid(s, '|');}
104 	public static int parseGiToTaxid(byte[] s, char delimiter){
105 		int x=parseGiNumber(s, delimiter);
106 		if(x>=0){return array[x];}
107 		return -1;
108 	}
109 
110 	/** Parse a gi number, or return -1 if formatted incorrectly. */
parseGiNumber(String s, char delimiter)111 	static int parseGiNumber(String s, char delimiter){
112 		if(s==null || s.length()<4){return -1;}
113 //		System.err.println("a");
114 		if(s.charAt(0)=='>'){return getID(s.substring(1), delimiter);}
115 //		System.err.println("b");
116 		if(!s.startsWith("gi")){return -1;}
117 //		System.err.println("c");
118 //		System.err.println("d");
119 		int initial=s.indexOf(delimiter);
120 //		System.err.println("e");
121 		if(initial<0){
122 			if(delimiter!='~'){
123 				delimiter='~';
124 				initial=s.indexOf(delimiter);
125 			}
126 			if(initial<0){
127 				delimiter='_';
128 				initial=s.indexOf(delimiter);
129 			}
130 			if(initial<0){return -1;}
131 //			System.err.println("f");
132 //			System.err.println("g");
133 		}
134 //		System.err.println("h");
135 		if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
136 //		System.err.println("i");
137 
138 		int number=0;
139 		for(int i=initial+1; i<s.length(); i++){
140 			char c=s.charAt(i);
141 			if(c==delimiter){break;}
142 			assert(Tools.isDigit(c));
143 			number=(number*10)+(c-'0');
144 		}
145 //		System.err.println("j: "+number);
146 		return number;
147 	}
148 
149 	/** Parse a ncbi number, or return -1 if formatted incorrectly. */
parseTaxidNumber(String s, char delimiter)150 	public static int parseTaxidNumber(String s, char delimiter){
151 		if(s==null || s.length()<5){return -1;}
152 		if(s.charAt(0)=='>'){return parseTaxidNumber(s.substring(1), delimiter);}
153 		if(!s.startsWith("ncbi") && !s.startsWith("tid")){return -1;}
154 		int initial=s.indexOf(delimiter);
155 		if(initial<0){
156 			delimiter='_';
157 			initial=s.indexOf(delimiter);
158 			if(initial<0){return -1;}
159 		}
160 		if(!Tools.isDigit(s.charAt(initial+1))){return -1;}
161 
162 		int number=0;
163 		for(int i=initial+1; i<s.length(); i++){
164 			char c=s.charAt(i);
165 			if(c==delimiter || c==' '){break;}
166 			assert(Tools.isDigit(c)) : c+"\n"+s;
167 			number=(number*10)+(c-'0');
168 		}
169 		return number;
170 	}
171 
172 
getID(String s)173 	public static int getID(String s){return getID(s, '|');}
174 	/** Get the taxID from a header starting with a taxID or gi number */
getID(String s, char delimiter)175 	public static int getID(String s, char delimiter){
176 		int x=parseTaxidNumber(s, delimiter);
177 		if(x>=0){return x;}
178 		x=parseGiNumber(s, delimiter);
179 		if(x>=0){return array[x];}
180 		return -1;
181 	}
182 
183 	/** Parse a gi number, or return -1 if formatted incorrectly. */
parseGiNumber(byte[] s, char delimiter)184 	static int parseGiNumber(byte[] s, char delimiter){
185 		if(s==null || s.length<4){return -1;}
186 		if(!Tools.startsWith(s, "gi") && !Tools.startsWith(s, ">gi")){return -1;}
187 		int initial=Tools.indexOf(s, (byte)delimiter);
188 		if(initial<0){
189 			delimiter='_';
190 			initial=Tools.indexOf(s, (byte)delimiter);
191 			if(initial<0){return -1;}
192 		}
193 		if(!Tools.isDigit(s[initial+1])){return -1;}
194 
195 		int number=0;
196 		for(int i=initial+1; i<s.length; i++){
197 			byte c=s[i];
198 			if(c==delimiter){break;}
199 			assert(Tools.isDigit(c));
200 			number=(number*10)+(c-'0');
201 		}
202 		return number;
203 	}
204 
205 	/** Parse a gi number, or return -1 if formatted incorrectly. */
parseNcbiNumber(byte[] s, char delimiter)206 	static int parseNcbiNumber(byte[] s, char delimiter){
207 		if(s==null || s.length<3){return -1;}
208 		if(!Tools.startsWith(s, "ncbi") && !Tools.startsWith(s, ">ncbi") && !Tools.startsWith(s, "tid") && !Tools.startsWith(s, ">tid")){return -1;}
209 		int initial=Tools.indexOf(s, (byte)delimiter);
210 		if(initial<0){
211 			delimiter='_';
212 			initial=Tools.indexOf(s, (byte)delimiter);
213 			if(initial<0){return -1;}
214 		}
215 		if(!Tools.isDigit(s[initial+1])){return -1;}
216 
217 		int number=0;
218 		for(int i=initial+1; i<s.length; i++){
219 			byte c=s[i];
220 			if(c==delimiter){break;}
221 			assert(Tools.isDigit(c));
222 			number=(number*10)+(c-'0');
223 		}
224 		return number;
225 	}
226 
getID(byte[] s)227 	public static int getID(byte[] s){return getID(s, '|');}
228 	/** Get the taxID from a header starting with a taxID or gi number */
getID(byte[] s, char delimiter)229 	public static int getID(byte[] s, char delimiter){
230 		int x=parseGiNumber(s, delimiter);
231 		if(x>=0){return array[x];}
232 		return parseNcbiNumber(s, delimiter);
233 	}
234 
235 	/** Get the taxID from a gi number */
getID(int gi)236 	public static int getID(int gi){
237 		assert(gi>=0) : gi;
238 		assert(gi<array.length) : gi+", "+array.length;
239 		return array[gi];
240 	}
241 
242 	public static void initialize(String fname){
243 		assert(fname!=null);
244 		if(fileString==null || !fileString.equals(fname)){
245 			synchronized(GiToTaxid.class){
246 				if(!initialized || fileString==null || !fileString.equals(fname)){
247 					fileString=fname;
248 					if(fname.contains(".int1d")){
249 						array=ReadWrite.read(int[].class, fname, true);
250 					}else{
251 						array=makeArray(fname);
252 					}
253 				}
254 				initialized=true;
255 			}
256 		}
257 	}
258 
259 	public static boolean isInitialized(){return initialized;}
260 
261 	public static synchronized void unload(){
262 		array=null;
263 		fileString=null;
264 		initialized=false;
265 	}
266 
267 	private static int[] makeArray(String fnames){
268 		String[] split;
269 		if(new File(fnames).exists()){split=new String[] {fnames};}
270 		else if(fnames.indexOf(',')>=0){split=fnames.split(",");}
271 		else if(fnames.indexOf('#')>=0){
272 			assert(fnames.indexOf("/")<0) : "Note: Wildcard # only works for relative paths in present working directory.";
273 			File dir=new File(System.getProperty("user.dir"));
274 			String prefix=fnames.substring(0, fnames.indexOf('#'));
275 			String suffix=fnames.substring(fnames.indexOf('#')+1);
276 
277 			File[] array=dir.listFiles();
278 			StringBuilder sb=new StringBuilder();
279 			String comma="";
280 			for(File f : array){
281 				String s=f.getName();
282 				if(s.startsWith(prefix) && s.startsWith(suffix)){
283 					sb.append(comma);
284 					sb.append(s);
285 					comma=",";
286 				}
287 			}
288 			split=sb.toString().split(",");
289 		}else{
290 			throw new RuntimeException("Invalid file: "+fnames);
291 		}
292 
293 		IntList list=new IntList();
294 //		assert(max<Integer.MAX_VALUE) : "Overflow.";
295 //		int[] x=new int[(int)max+1];
296 //		Arrays.fill(x, -1);
297 
298 		long total=0;
299 		for(String s : split){
300 			long count=addToList(s, list);
301 			total+=count;
302 		}
303 		return list.shrink().array;
304 	}
305 
306 	private static long addToList(String fname, IntList list){
307 		boolean warned=false;
308 		ByteFile bf=ByteFile.makeByteFile(fname, true);
309 		long count=0, invalid=0;
310 		byte[] line=bf.nextLine();
311 		while(line!=null){
Tools.isDigit(line[line.length-1])312 			if(line.length>0 && Tools.isDigit(line[line.length-1])){//Invalid lines will end with tab or na
313 				count++;
314 				int tab2=Tools.indexOfNth(line, '\t', 2);
315 				int tab3=Tools.indexOfNth(line, '\t', 1, tab2+1);
316 				assert(tab2>0 && (tab2<tab3) && tab3<line.length) : tab2+", "+tab3+", "+line.length;
317 				assert(tab2<line.length && line[tab2]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
318 				assert(tab3<line.length && line[tab3]=='\t') : tab2+", "+tab3+", '"+new String(line)+"'";
319 //				assert(false) : tab2+", "+tab3+", '"+new String(line)+"'";
320 				int tid=Parse.parseInt(line, tab2+1, tab3);
321 				int gi=Parse.parseInt(line, tab3+1, line.length);
322 				if(gi>=Shared.MAX_ARRAY_LEN || gi<0){//A gi over 2.5b was observed May 3, 2021.
323 					invalid++;
324 				}else{
325 				assert(gi>=0) : "tid="+tid+", gi="+gi+", line=\n'"+new String(line)+"'";
326 				int old=list.get(gi);
327 				assert(old==0 || old==tid) : "Contradictory entries for gi "+gi+": "+old+" -> "+tid+"\n'"+new String(line)+"'\ntab2="+tab2+", tab3="+tab3;
328 
329 				list.set(gi, tid);
330 
331 				//assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
332 //				if(x[gi]!=-1 && x[gi]!=ncbi){
333 //					if(!warned){
334 //						System.err.println("***WARNING*** For file "+fname+":\n"+
335 //								("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
336 //								"\nThis may be an error from NCBI and you may wish to report it, but it is\n"
337 //								+ "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
338 //								+ "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
339 //						warned=true;
340 //					}
341 //				}else{
342 //					x[gi]=ncbi;
343 //				}
344 				}
345 			}else{
346 				if(line.length==0){System.err.println(fname+", "+count);}//debug
347 				invalid++;
348 			}
349 			line=bf.nextLine();
350 		}
351 		if(verbose){System.err.println("Count: "+count+"; \tInvalid: "+invalid);}
352 		bf.close();
353 		return count;
354 	}
355 
356 //	private static int[] makeArrayOld(String fnames){
357 //		String[] split;
358 //		if(new File(fnames).exists()){split=new String[] {fnames};}
359 //		else{split=fnames.split(",");}
360 //
361 //		long max=0;
362 //		for(String s : split){
363 //			max=Tools.max(max, findMaxID(s));
364 //		}
365 //
366 //		assert(max<Integer.MAX_VALUE) : "Overflow.";
367 //		int[] x=new int[(int)max+1];
368 //		Arrays.fill(x, -1);
369 //
370 //		long total=0;
371 //		for(String s : split){
372 //			long count=fillArray(s, x);
373 //			total+=count;
374 //		}
375 //		return x;
376 //	}
377 //
378 //	private static long findMaxID(String fname){
379 //		ByteFile bf=ByteFile.makeByteFile(fname, true);
380 //		long count=0, max=0;
381 //		byte[] line=bf.nextLine();
382 //		while(line!=null){
383 //			count++;
384 //			int tab=Tools.indexOf(line, (byte)'\t');
385 //			long gi=Parse.parseLong(line, 0, tab);
386 //			max=Tools.max(max, gi);
387 //			line=bf.nextLine();
388 //		}
389 //		bf.close();
390 //		return max;
391 //	}
392 //
393 //	private static long fillArray(String fname, int[] x){
394 //		boolean warned=false;
395 //		ByteFile bf=ByteFile.makeByteFile(fname, true);
396 //		long count=0;
397 //		byte[] line=bf.nextLine();
398 //		while(line!=null){
399 //			count++;
400 //			int tab=Tools.indexOf(line, (byte)'\t');
401 //			int gi=Parse.parseInt(line, 0, tab);
402 //			int ncbi=Parse.parseInt(line, tab+1, line.length);
403 //			//assert(x[gi]==-1 || x[gi]==ncbi) : "Contradictory entries for gi "+gi+": "+x[gi]+" -> "+ncbi;
404 //			if(x[gi]!=-1 && x[gi]!=ncbi){
405 //				if(!warned){
406 //					System.err.println("***WARNING*** For file "+fname+":\n"+
407 //							("Contradictory entries for gi "+gi+": mapped to both taxID "+x[gi]+" and taxID "+ncbi)+
408 //							"\nThis may be an error from NCBI and you may wish to report it, but it is\n"
409 //							+ "being suppressed because NCBI data is known to contain multi-mapped gi numbers,\n"
410 //							+ "at least between nucleotide and protein, and gi numbers are deprecated anyway.");
411 //					warned=true;
412 //				}
413 //			}else{
414 //				x[gi]=ncbi;
415 //			}
416 //			line=bf.nextLine();
417 //		}
418 //		if(verbose){System.err.println("Count: "+count);}
419 //		bf.close();
420 //		return count;
421 //	}
422 
423 	private static int[] array;
424 	private static String fileString;
425 
426 	public static boolean verbose=false;
427 	private static boolean initialized=false;
428 }
429