1 package prok;
2 
3 import java.io.File;
4 
5 import dna.AminoAcid;
6 import dna.Data;
7 import fileIO.FileFormat;
8 import fileIO.ReadWrite;
9 import shared.Parse;
10 import shared.Tools;
11 import stream.ConcurrentReadInputStream;
12 import stream.Read;
13 import stream.ReadInputStream;
14 import structures.ListNum;
15 import structures.LongHashSet;
16 
17 /** Contains a lot of statics and static methods for gene-calling */
18 public abstract class ProkObject {
19 
parse(String arg, String a, String b)20 	public static boolean parse(String arg, String a, String b){
21 		if(a.equalsIgnoreCase("16sstartslop") || a.equalsIgnoreCase("ssustartslop")){
22 			ssuStartSlop=Integer.parseInt(b);
23 		}else if(a.equalsIgnoreCase("23sstartslop") || a.equalsIgnoreCase("lsustartslop")){
24 			lsuStartSlop=Integer.parseInt(b);
25 		}else if(a.equalsIgnoreCase("5sstartslop")){
26 			r5SStartSlop=Integer.parseInt(b);
27 		}else if(a.equalsIgnoreCase("16sstopslop") || a.equalsIgnoreCase("ssustopslop")){
28 			ssuStopSlop=Integer.parseInt(b);
29 		}else if(a.equalsIgnoreCase("23sstopslop") || a.equalsIgnoreCase("lsustopslop")){
30 			lsuStopSlop=Integer.parseInt(b);
31 		}else if(a.equalsIgnoreCase("5sstopslop")){
32 			r5SStopSlop=Integer.parseInt(b);
33 		}else if(a.equals("plus")){
34 			PROCESS_PLUS_STRAND=Parse.parseBoolean(b);
35 		}else if(a.equals("minus")){
36 			PROCESS_MINUS_STRAND=Parse.parseBoolean(b);
37 		}
38 
39 		else if(a.equalsIgnoreCase("min16SIdentity") || a.equalsIgnoreCase("min16SId")) {
40 			min16SIdentity=Float.parseFloat(b);
41 		}else if(a.equalsIgnoreCase("min18SIdentity") || a.equalsIgnoreCase("min18SId")) {
42 			min18SIdentity=Float.parseFloat(b);
43 		}else if(a.equalsIgnoreCase("min23SIdentity") || a.equalsIgnoreCase("min23SId")) {
44 			min23SIdentity=Float.parseFloat(b);
45 		}else if(a.equalsIgnoreCase("min5SIdentity") || a.equalsIgnoreCase("min5SId")) {
46 			min5SIdentity=Float.parseFloat(b);
47 		}
48 
49 		else if(a.equalsIgnoreCase("align16s") || a.equalsIgnoreCase("load16SSequence")){
50 			load16SSequence=Parse.parseBoolean(b);
51 		}else if(a.equalsIgnoreCase("align23s") || a.equalsIgnoreCase("load23SSequence")){
52 			load23SSequence=Parse.parseBoolean(b);
53 		}else if(a.equalsIgnoreCase("align18s") || a.equalsIgnoreCase("load18SSequence")){
54 			load18SSequence=Parse.parseBoolean(b);
55 		}else if(a.equalsIgnoreCase("align5s") || a.equalsIgnoreCase("load5SSequence")){
56 			load5SSequence=Parse.parseBoolean(b);
57 		}
58 
59 		else if(a.equalsIgnoreCase("load16skmers") || a.equalsIgnoreCase("load18skmers") || a.equalsIgnoreCase("loadssukmers")){
60 			loadSSUkmers=Parse.parseBoolean(b);
61 		}else if(a.equalsIgnoreCase("load23skmers") || a.equalsIgnoreCase("load28skmers") || a.equalsIgnoreCase("loadlsukmers")){
62 			loadLSUkmers=Parse.parseBoolean(b);
63 		}else if(a.equalsIgnoreCase("load5skmers")){
64 			load5Skmers=Parse.parseBoolean(b);
65 		}else if(a.equalsIgnoreCase("loadtrnakmers")){
66 			loadtRNAkmers=Parse.parseBoolean(b);
67 		}else if(a.equalsIgnoreCase("klongtrna")){
68 			kLongTRna=Integer.parseInt(b);
69 		}else if(a.equalsIgnoreCase("longkmers")){
70 			loadSSUkmers=loadLSUkmers=load5Skmers=loadtRNAkmers=Parse.parseBoolean(b);
71 		}else if(a.equalsIgnoreCase("klong5s")){
72 			kLong5S=Integer.parseInt(b);
73 		}else if(a.equalsIgnoreCase("klong16s") || a.equalsIgnoreCase("klong18s") || a.equalsIgnoreCase("klongssu")){
74 			kLongSSU=Integer.parseInt(b);
75 		}else if(a.equalsIgnoreCase("klong23s") || a.equalsIgnoreCase("klong28s") || a.equalsIgnoreCase("klonglsu")){
76 			kLongLSU=Integer.parseInt(b);
77 		}else if(a.equalsIgnoreCase("klongtrna")){
78 			kLongTRna=Integer.parseInt(b);
79 		}
80 
81 		else{
82 			return false;
83 		}
84 		return true;
85 	}
86 
87 	/*--------------------------------------------------------------*/
88 
processType(int type)89 	public static boolean processType(int type){
90 		return (type==CDS ? callCDS : type==r16S ? call16S : type==r23S ? call23S : type==r18S ? call18S : type==r5S ? call5S : type==tRNA ? calltRNA : true);
91 	}
92 
startSlop(int type)93 	public static int startSlop(int type) {
94 		int slop=(type==r16S ? ssuStartSlop : type==r23S ? lsuStartSlop : type==r18S ? ssuStartSlop : type==r5S ? r5SStartSlop : 9999);
95 		return slop;
96 	}
97 
stopSlop(int type)98 	public static int stopSlop(int type) {
99 		int slop=(type==r16S ? ssuStopSlop : type==r23S ? lsuStopSlop : type==r18S ? ssuStopSlop : type==r5S ? r5SStopSlop : 9999);
100 		return slop;
101 	}
102 
minID(int type)103 	public static float minID(int type) {
104 		float minIdentity=(type==r16S ? min16SIdentity : type==r23S ? min23SIdentity : type==r18S ? min18SIdentity : type==r5S ? min5SIdentity : 0);
105 		return minIdentity;
106 	}
107 
consensusReads(int type)108 	public static Read[] consensusReads(int type) {
109 		Read[] consensusReads=(type==r16S ? r16SSequence : type==r23S ? r23SSequence : type==r18S ? r18SSequence : type==r5S ? r5SSequence : null);
110 		return consensusReads;
111 	}
112 
kmerSet(int type)113 	public static LongHashSet kmerSet(int type) {
114 		LongHashSet set=(type==tRNA ? trnaKmers : type==r16S ? ssuKmers : type==r23S ? lsuKmers : type==r5S ? r5SKmers : type==r18S ? ssuKmers : null);
115 		return set;
116 	}
117 
kLongLen(int type)118 	public static int kLongLen(int type) {
119 		int kLongLen=(type==tRNA ? kLongTRna : type==r16S ? kLongSSU : type==r23S ? kLongLSU : type==r5S ? kLong5S : type==r18S ? kLongSSU : -1);
120 		return kLongLen;
121 	}
122 
flagToType(int flag)123 	public static int flagToType(int flag) {
124 		return Integer.numberOfTrailingZeros(flag)+1;
125 	}
126 
typeToFlag(int type)127 	public static byte typeToFlag(int type) {
128 		assert(type<=6);
129 		return (byte)(1<<(type-1));
130 	}
131 
callType(int type)132 	public static boolean callType(int type){//TODO: Turn these functions into array lookups
133 		if(type==CDS){return callCDS;}
134 		else if(type==tRNA){return calltRNA;}
135 		else if(type==r16S){return call16S;}
136 		else if(type==r23S){return call23S;}
137 		else if(type==r5S){return call5S;}
138 		else if(type==r18S){return call18S;}
139 		assert(false) : type;
140 		return false;
141 	}
142 
143 	/*--------------------------------------------------------------*/
144 	/*----------------          Long Kmers          ----------------*/
145 	/*--------------------------------------------------------------*/
146 
loadLongKmers()147 	public static synchronized void loadLongKmers(){
148 //		assert(ssuKmers==null);
149 //		assert(false) : load5Skmers+", "+kLong5s;
150 		if(loadedLongKmers){return;}
151 		if(loadSSUkmers){ssuKmers=loadLongKmersByType(kLongSSU, "ssu");}
152 		if(loadLSUkmers){lsuKmers=loadLongKmersByType(kLongLSU, "lsu");}
153 		if(load5Skmers){r5SKmers=loadLongKmersByType(kLong5S, "5S");}
154 		if(loadtRNAkmers){trnaKmers=loadLongKmersByType(kLongTRna, "tRNA");}
155 		loadedLongKmers=true;
156 	}
157 
158 //	private static LongHashSet loadLongKmers(StatsContainer sc, int k, String prefix){
159 //		String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa");
160 //		if(!new File(fname).exists()){
161 //			fname=fname+".gz";
162 //			if(!new File(fname).exists()){
163 //				System.err.println("Can't find "+fname);
164 //				return null;
165 //			}
166 //		}
167 //		LongHashSet set=loadLongKmers(fname, k);
168 //		sc.kmerSet=set;
169 //		sc.kLongLen=k;
170 //		return set;
171 //	}
172 
loadLongKmersByType(int k, String prefix)173 	private static LongHashSet loadLongKmersByType(int k, String prefix){
174 		String fname=Data.findPath("?"+prefix+"_"+k+"mers.fa", true);
175 		if(!new File(fname).exists()){
176 			fname=fname+".gz";
177 			if(!new File(fname).exists()){
178 				System.err.println("Can't find "+fname);
179 				return null;
180 			}
181 		}
182 		LongHashSet set=loadLongKmers(fname, k);
183 		return set;
184 	}
185 
loadLongKmers(String fname, int k)186 	private static LongHashSet loadLongKmers(String fname, int k){//TODO: Consider making this a LongHashSet.  No reason not to...
187 		FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false);
188 		ConcurrentReadInputStream cris=ConcurrentReadInputStream.getReadInputStream(-1, false, ff, null);
189 		cris.start(); //Start the stream
190 //		if(verbose){outstream.println("Started cris");}
191 
192 		LongHashSet set=new LongHashSet(1000);
193 		ListNum<Read> ln=cris.nextList();
194 		while(ln!=null && ln.size()>0){
195 			processList(ln, set, k);
196 			cris.returnList(ln);
197 			ln=cris.nextList();
198 		}
199 		if(ln!=null){cris.returnList(ln.id, ln.list==null || ln.list.isEmpty());}
200 		ReadWrite.closeStream(cris);
201 		return set;
202 	}
203 
processList(ListNum<Read> ln, LongHashSet set, int k)204 	private static LongHashSet processList(ListNum<Read> ln, LongHashSet set, int k){
205 		final long mask=~((-1L)<<(2*k));
206 		for(Read r : ln){
207 			final byte[] bases=r.bases;
208 			long kmer=0;
209 			int len=0;
210 			for(byte b : bases){
211 				final int num=AminoAcid.baseToNumber[b];
212 				if(num>=0){
213 					len++;
214 					kmer=((kmer<<2)|num)&mask;
215 					if(len>=k){
216 						set.add(kmer);
217 					}
218 				}else{
219 					len=0;
220 				}
221 			}
222 		}
223 		return set;
224 	}
225 
226 	/*--------------------------------------------------------------*/
227 	/*----------------      Consensus Sequence      ----------------*/
228 	/*--------------------------------------------------------------*/
229 
loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro)230 	public static synchronized void loadConsensusSequenceFromFile(boolean removeMito, boolean removeChloro){
231 		if(loadedConsensusSequence){return;}
232 //		assert(r16SSequence==null);
233 		if(load16SSequence){r16SSequence=loadConsensusSequenceType("16S", removeMito, removeChloro);}
234 		if(load18SSequence){r18SSequence=loadConsensusSequenceType("18S", removeMito, removeChloro);}
235 		if(load23SSequence){r23SSequence=loadConsensusSequenceType("23S", removeMito, removeChloro);}
236 		if(load5SSequence){r5SSequence=loadConsensusSequenceType("5S", removeMito, removeChloro);}
237 		if(loadtRNASequence){trnaSequence=loadConsensusSequenceType("tRNA", removeMito, removeChloro);}
238 		loadedConsensusSequence=true;
239 	}
240 
loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro)241 	public static Read[] loadConsensusSequenceType(String prefix, boolean removeMito, boolean removeChloro){
242 		String fname=null;
243 		fname=Data.findPath("?"+prefix+"_consensus_sequence.fq", false);
244 		if(fname!=null && (fname.endsWith(".jar") || new File(fname).exists())){
245 			fname=Tools.fixExtension(fname);
246 		}else{
247 			fname=Data.findPath("?"+prefix+"_consensus_sequence.fa", true);
248 			fname=Tools.fixExtension(fname);
249 			if(!fname.endsWith(".jar") && !new File(fname).exists()){
250 				System.err.println("Can't find "+fname);
251 				return null;
252 			}
253 		}
254 		Read[] array=loadConsensusSequence(fname);
255 		if(removeMito){array=stripOrganelle(array, "mito");}
256 		if(removeChloro){array=stripOrganelle(array, "plastid");}
257 		return array;
258 	}
259 
loadConsensusSequence(String fname)260 	private static Read[] loadConsensusSequence(String fname){
261 		FileFormat ff=FileFormat.testInput(fname, FileFormat.FA, null, false, false);
262 		Read[] array=ReadInputStream.toReadArray(ff, -1);
263 		return array;
264 	}
265 
stripOrganelle(Read[] array, String key)266 	private static Read[] stripOrganelle(Read[] array, String key){
267 		int removed=0;
268 		for(int j=0; j<array.length; j++){
269 			if(array[j].id.toLowerCase().startsWith(key)) {
270 				array[j]=null;
271 				removed++;
272 			}
273 		}
274 		if(removed>0){array=Tools.condenseStrict(array);}
275 		return array;
276 	}
277 
278 	/*--------------------------------------------------------------*/
279 
	/** Integer codes for feature types, used as the 'type' argument of the static lookups in this class. */
	public static final int CDS=0, tRNA=1, r16S=2, r23S=3, r5S=4, r18S=5, r28S=6, RNA=7;
	/** Name of each feature type, listed in the same order as the type codes. */
	public static String[] typeStrings=new String[] {"CDS", "tRNA", "16S", "23S", "5S", "18S", "28S", "RNA"};
	/** Coarser names in the same order; all ribosomal types (codes 2 through 6) share "rRNA". */
	public static String[] typeStrings2=new String[] {"CDS", "tRNA", "rRNA", "rRNA", "rRNA", "rRNA", "rRNA", "RNA"};
	/** Names recognized by isSpecialType; the CDS and RNA positions are null and so never match. */
	public static String[] specialTypeStrings=new String[] {null, "tRNA", "16S", "23S", "5S", "18S", "28S", null};
isSpecialType(String type)284 	public static boolean isSpecialType(String type){
285 		if(type==null){return false;}
286 		for(String s : specialTypeStrings){
287 			if(type.equalsIgnoreCase(s)){return true;}
288 		}
289 		return false;
290 	}
291 
	//Not referenced elsewhere in this file; presumably kmer lengths for RNA models - TODO confirm against users
	public static int kInnerRNA=6;
	public static int kStartRNA=3;
	public static int kStopRNA=3;

	//Long-kmer lengths per type; settable via parse ("klongssu" etc.) and used by loadLongKmers to pick the kmer file
	public static int kLongSSU=15;
	public static int kLongLSU=15;
	public static int kLong5S=15;
	public static int kLongTRna=15;

	//Minimum identity per rRNA type; settable via parse and returned by minID
	public static float min16SIdentity=0.62f;
	public static float min23SIdentity=0.60f;
	public static float min5SIdentity=0.60f;
	public static float min18SIdentity=0.60f;

	//Start/stop slop per rRNA class; settable via parse and returned by startSlop/stopSlop (16S and 18S share the ssu values)
	static int ssuStartSlop=200;
	static int ssuStopSlop=0;
	static int lsuStartSlop=220;
	static int lsuStopSlop=0;
	static int r5SStartSlop=50;
	static int r5SStopSlop=50;

	//Per-type call flags returned by processType and callType; not set by parse in this file
	public static boolean callCDS=true;
	public static boolean calltRNA=true;
	public static boolean call16S=true;
	public static boolean call23S=true;
	public static boolean call5S=true;
	public static boolean call18S=false;

	//Long-kmer sets; null until loadLongKmers populates them (and left null if the file is missing or loading is disabled)
	public static LongHashSet ssuKmers=null;
	public static LongHashSet lsuKmers=null;
	public static LongHashSet r5SKmers=null;
	public static LongHashSet trnaKmers=null;

	//Consensus sequences; null until loadConsensusSequenceFromFile populates them
	public static Read[] trnaSequence=null;
	public static Read[] r16SSequence=null;
	public static Read[] r23SSequence=null;
	public static Read[] r5SSequence=null;
	public static Read[] r18SSequence=null;

	//Strand toggles, set by the "plus" and "minus" keys in parse; not read in this file
	public static boolean PROCESS_PLUS_STRAND=true;
	public static boolean PROCESS_MINUS_STRAND=true;

	//Which long-kmer sets loadLongKmers should load
	public static boolean loadSSUkmers=true;
	public static boolean loadLSUkmers=true;
	public static boolean load5Skmers=true;
	public static boolean loadtRNAkmers=true;
	//Set once loadLongKmers has run, so later calls are no-ops
	private static boolean loadedLongKmers=false;

	//Which consensus sequences loadConsensusSequenceFromFile should load; loadtRNASequence has no key in parse
	public static boolean loadtRNASequence=false;
	public static boolean load16SSequence=true;
	public static boolean load23SSequence=true;
	public static boolean load5SSequence=true;
	public static boolean load18SSequence=true;
	//Set once loadConsensusSequenceFromFile has run, so later calls are no-ops
	private static boolean loadedConsensusSequence=false;
346 
347 }
348