1 package dna;
2 import java.io.Serializable;
3 
4 import shared.Shared;
5 import shared.Tools;
6 
7 
8 public class Gene implements Comparable<Gene>, Serializable{
9 
10 //	/**
11 //	 *
12 //	 */
13 	private static final long serialVersionUID = -1342555621377050981L;
14 
15 
Gene()16 	public Gene(){
17 		chromosome=-1;
18 //		nc_accession=null;
19 		symbol=null;
20 		proteinAcc=null;
21 		id=-1;
22 		mrnaAcc=null;
23 		status=-1;
24 		completeness=-1;
25 		strand=-1;
26 		codeStart=txStart=-1;
27 		codeStop=txStop=-1;
28 		exons=null;
29 		cdsStartStat=-1;
30 		cdsEndStat=-1;
31 		exonFrames=null;
32 		txLength=-1;
33 		codeLength=-1;
34 		exonLength=-1;
35 		exonCodeLength=-1;
36 		aaLength=-1;
37 		utrLength5prime=-1;
38 		utrLength3prime=-1;
39 		readCorrectly=false;
40 		untranslated=false;
41 		pseudo=false;
42 		description=null;
43 		fullDescription=null;
44 		valid=true;
45 		primarySource=-1;
46 	}
47 
Gene(byte chrom, byte strand_, int txStart_, int txStop_, int cdStart_, int cdStop_, int gid, String name_, String trans_, String protTrans_, String status_, String completeness_, Exon[] exons_, boolean untran, boolean pseudo_, boolean valid_, String primarySource_, String descript_, String fullDescript_)48 	public Gene(byte chrom, byte strand_, int txStart_, int txStop_, int cdStart_, int cdStop_, int gid,
49 			String name_, String trans_, String protTrans_, String status_, String completeness_,
50 			Exon[] exons_, boolean untran, boolean pseudo_, boolean valid_,
51 			String primarySource_, String descript_, String fullDescript_){
52 
53 		chromosome=chrom;
54 //		nc_accession=null;
55 		symbol=name_;
56 		id=gid;
57 		mrnaAcc=((trans_==null || trans_.length()<1 || trans_.equals("-")) ? null : trans_);
58 		proteinAcc=((protTrans_==null || protTrans_.length()<1 || protTrans_.equals("-")) ? null : protTrans_);
59 
60 		primarySource=primarySource_==null ? -1 : (byte)Tools.find(primarySource_, sourceCodes);
61 		description=descript_;
62 		fullDescription=fullDescript_;
63 
64 
65 		status=status_==null ? -1 : (byte)Tools.find(status_, statusCodes);
66 		completeness=completeness_==null ? -1 : (byte)Tools.find(completeness_, completenessCodes);
67 		strand=strand_;
68 
69 		exons=exons_;
70 
71 		txStart=txStart_;
72 		txStop=txStop_; //Assuming pure 0-based numbering.
73 		codeStart=cdStart_;
74 		codeStop=cdStop_; //Assuming pure 0-based numbering.
75 
76 		assert(codeStart>=txStart) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc;
77 		assert(codeStop<=txStop) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc;
78 
79 
80 //		cdsStartStat=(byte)find("?", endStatCodes);
81 //		cdsEndStat=(byte)find("?", endStatCodes);
82 		cdsStartStat=-1;
83 		cdsEndStat=-1;
84 
85 		exonFrames=null;
86 
87 		txLength=txStop-txStart+1;
88 		codeLength=(codeStop==codeStart ? 0 : codeStop-codeStart+1);
89 
90 		untranslated=untran;
91 		pseudo=pseudo_;
92 
93 		int eLen=0, ecLen=0, utr0=0, utr2=0;
94 
95 		if(exons!=null){
96 
97 			for(Exon e : exons){
98 
99 				utr0+=max(0, min(e.b, codeStart)-e.a);
100 				utr2+=max(0, e.b-max(e.a, codeStop));
101 
102 				int len=e.b-e.a+1;
103 				eLen+=len;
104 				len=(min(e.b, codeStop)-max(e.a, codeStart));
105 				len=max(0, len+1);
106 				ecLen+=len;
107 			}
108 		}
109 
110 
111 		exonLength=(eLen<2 ? 0 : eLen);
112 		exonCodeLength=(codeLength<1 || exonLength<1 ? 0 : ecLen);
113 		aaLength=exonCodeLength/3-1;
114 
115 		assert(exonLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n";
116 		assert(codeLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n";
117 
118 		//assert(exonCodeLength%3 == 0); //This should be true with a correct database
119 
120 		if(strand==Shared.PLUS){
121 			utrLength5prime=untranslated ? 0 : utr0;
122 			utrLength3prime=untranslated ? 0 : utr2;
123 		}else{
124 			utrLength5prime=untranslated ? 0 : utr2;
125 			utrLength3prime=untranslated ? 0 : utr0;
126 		}
127 
128 		//System.err.println(name+", "+exonLength+", "+exonCodeLength+(exons==null ? "" : ", "+exons.length));
129 
130 		readCorrectly=true;
131 		valid=(readCorrectly && valid_);
132 	}
133 
134 
merge(Gene g)135 	public Gene merge(Gene g){
136 
137 		assert((exons==null && g.exons==null) ||
138 				(exons!=null && g.exons!=null && exons.length==g.exons.length));
139 //		assert(exonLength==g.exonLength);
140 		assert(Math.abs(exonLength-g.exonLength)<=8) : "\n\n"+this+"\n\n"+g+"\n\n";
141 		assert(strand==g.strand);
142 //		assert(codeStart==g.codeStart);
143 //		assert(codeStop==g.codeStop);
144 
145 		String Xsymbol=symbol;
146 		String XproteinAcc=proteinAcc;
147 		int Xid=id;
148 		String XmrnaAcc=mrnaAcc;
149 		int Xstatus=status;
150 		int Xcompleteness=completeness;
151 		int XcodeStart=codeStart;
152 		int XcodeStop=codeStop;
153 		int XtxStart=txStart;
154 		int XtxStop=txStop;
155 		int XcdsStartStat=cdsStartStat;
156 		int XcdsEndStat=cdsEndStat;
157 		byte[] XexonFrames=exonFrames;
158 		int XtxLength=txLength;
159 		int XcodeLength=codeLength;
160 		int XexonLength=exonLength;
161 		int XexonCodeLength=exonCodeLength;
162 		int XaaLength=aaLength;
163 		int XutrLength5prime=utrLength5prime;
164 		int XutrLength3prime=utrLength3prime;
165 //		boolean XreadCorrectly=readCorrectly;
166 		boolean Xuntranslated=untranslated;
167 		boolean Xpseudo=pseudo;
168 		String Xdescription=description;
169 		String XfullDescription=fullDescription;
170 		boolean Xvalid=valid;
171 
172 		assert(untranslated || g.untranslated || g.codeStart>=txStart) : "\n"+this+"\n\n"+g;
173 		assert(untranslated || g.untranslated || g.codeStop<=txStop) : "\n"+this+"\n\n"+g;
174 
175 		if(Xsymbol==null){Xsymbol=g.symbol;}
176 		if(XproteinAcc==null){XproteinAcc=g.proteinAcc;}
177 		if(Xid<0){Xid=g.id;}
178 		if(XmrnaAcc==null){XmrnaAcc=g.mrnaAcc;}
179 		if(Xstatus<0){Xstatus=g.status;}
180 		if(Xcompleteness<0){Xcompleteness=g.completeness;}
181 
182 
183 		if(XcodeStart==XcodeStop && g.codeStart<g.codeStop){
184 			assert(g.codeStart>=txStart);
185 			assert(g.codeStop<=txStop);
186 			XcodeStart=g.codeStart;
187 			XcodeStop=g.codeStop;
188 		}
189 
190 		//These two should never happen...
191 		if(XtxStart<0){XtxStart=g.txStart;}
192 		if(XtxStop<0){XtxStop=g.txStop;}
193 
194 		if(XcdsStartStat<0){XcdsStartStat=g.cdsStartStat;}
195 		if(XcdsEndStat<0){XcdsEndStat=g.cdsEndStat;}
196 		if(XexonFrames==null){XexonFrames=g.exonFrames;}
197 		if(XtxLength<0){XtxLength=g.txLength;}
198 		if(XcodeLength<0){XcodeLength=g.codeLength;}
199 		if(XexonLength<0){XexonLength=g.exonLength;}
200 		if(XexonCodeLength<0){XexonCodeLength=g.exonCodeLength;}
201 		if(XaaLength<0){XaaLength=g.aaLength;}
202 		if(XutrLength5prime<0){XutrLength5prime=g.utrLength5prime;}
203 		if(XutrLength3prime<0){XutrLength3prime=g.utrLength3prime;}
204 		if(Xdescription==null){Xdescription=g.description;}
205 		if(XfullDescription==null){XfullDescription=g.fullDescription;}
206 
207 //		if(XreadCorrectly){}
208 //		if(Xuntranslated){}
209 //		if(Xpseudo){}
210 //		if(Xvalid){}
211 
212 		//TODO Note that the readCorrectly field gets lost here
213 		Gene out=new Gene(chromosome, strand, XtxStart, XtxStop, XcodeStart, XcodeStop, Xid,
214 				symbol, XmrnaAcc, XproteinAcc,
215 				Xstatus< 0 ? null : statusCodes[Xstatus], Xcompleteness<0 ? null : completenessCodes[Xcompleteness],
216 				exons, Xuntranslated, Xpseudo, Xvalid, sourceCodes[primarySource], Xdescription, XfullDescription);
217 
218 		return out;
219 	}
220 
toStrand(String s)221 	public static byte toStrand(String s){
222 		assert(s!=null && s.length()==1);
223 		final char c=s.charAt(0);
224 		if(c=='+'){return Shared.PLUS;}
225 		else if(c=='-'){return Shared.MINUS;}
226 		else if(c=='?'){return 2;}
227 		throw new RuntimeException("Unknown strand: "+s);
228 	}
229 
toChromosome(final String s)230 	public static int toChromosome(final String s){
231 ////		assert(false) : s;
232 //		String s2=s;
233 //		if(s2.endsWith("random")){s2="U";}
234 //		if(s2.startsWith("chr")){s2=s2.substring(3);}
235 //		if(s2.equals("MT")){s2="M";}
236 ////		int loc=find2(s2.toUpperCase(), chromCodes);
237 //		int loc=find3(s2.toUpperCase(), chromCodes);
238 //
239 //		if(loc<0){
240 //			if(!Tools.isDigit(s2.charAt(0))){
241 //				loc=find3("U", chromCodes);
242 //			}else{
243 //				try {
244 //					loc=Integer.parseInt(s2);
245 //				} catch (NumberFormatException e) {
246 //					throw new RuntimeException(e);
247 //				}
248 //				assert(loc>=23 && loc<=26) : loc+", "+s;
249 //			}
250 //		}
251 //		assert(loc>=0) : s;
252 //		return loc;
253 
254 		String s2=s;
255 		if(s2.startsWith("chr")){s2=s2.substring(3);}
256 		int loc=Integer.parseInt(s2);
257 
258 		assert(loc>=0) : s;
259 		return loc;
260 	}
261 
toBuild(final String s)262 	public static int toBuild(final String s){
263 		String s2=s.toLowerCase();
264 		if(s2.startsWith("build")){s2=s2.substring(5);}
265 		else if(s2.startsWith("b")){s2=s2.substring(1);}
266 		else if(s2.startsWith("hg")){s2=s2.substring(1);}
267 
268 		if(s2.startsWith("=")){s2=s2.substring(1);}
269 
270 		assert(Tools.isDigit(s2.charAt(0))) : s;
271 
272 		return Integer.parseInt(s2);
273 	}
274 
fillExons(String eStarts, String eEnds, byte chr, byte str)275 	private void fillExons(String eStarts, String eEnds, byte chr, byte str){
276 		String[] s1=eStarts.split(",");
277 		String[] s2=eEnds.split(",");
278 
279 		int last=-1;
280 
281 		for(int i=0; i<s1.length; i++){
282 			int a=Integer.parseInt(s1[i]);
283 			int b=Integer.parseInt(s2[i])-1; //Note the -1 for 0-based numbering.
284 			assert(a>last) : eStarts;
285 			last=a;
286 
287 			boolean cds=overlap(a, b, codeStart, codeStop);
288 			boolean utr=(a<codeStart || b>codeStop);
289 
290 			Exon key=new Exon(a, b, chr, str, utr, cds);
291 			Exon value=Exon.table.get(key);
292 			if(value==null){
293 				value=key;
294 				Exon.table.put(key, key);
295 			}
296 			exons[i]=value;
297 		}
298 	}
299 
fillExonsCCDS(String estring, byte chr, byte str)300 	private Exon[] fillExonsCCDS(String estring, byte chr, byte str){
301 		String[] intervals=estring.replace("[","").replace("]","").replace(" ","").split(",");
302 
303 		int last=-1;
304 
305 		Exon[] array=new Exon[intervals.length];
306 
307 		for(int i=0; i<intervals.length; i++){
308 			String[] temp=intervals[i].split("-");
309 			int a=Integer.parseInt(temp[0]);
310 			int b=Integer.parseInt(temp[1]); //Note the pure 0-based numbering.
311 			assert(a>last) : estring;
312 			last=a;
313 
314 			boolean cds=overlap(a, b, codeStart, codeStop);
315 			boolean utr=(a<codeStart || b>codeStop);
316 
317 			Exon key=new Exon(a, b, chr, str, utr, cds);
318 			Exon value=Exon.table.get(key);
319 			if(value==null){
320 				value=key;
321 				Exon.table.put(key, key);
322 			}
323 			array[i]=value;
324 		}
325 		return array;
326 	}
327 
toGeneRelativeOffset(int index)328 	public int toGeneRelativeOffset(int index){
329 
330 		int off=0;
331 
332 		if(strand==Shared.PLUS){
333 
334 			//		System.out.println();
335 			for(Exon e : exons){
336 				//			System.out.print(e+" * ");
337 
338 				int temp=0;
339 				if(e.intersects(index)){
340 					temp=(int)(index-e.a);
341 				}else if(e.a>index){
342 					break;
343 				}else{
344 					temp=e.length();
345 				}
346 				assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length();
347 				assert(temp>=0) : index+", "+e;
348 				off+=temp;
349 			}
350 
351 		}else if(strand==Shared.MINUS){
352 			for(int i=exons.length-1; i>=0; i--){
353 				Exon e=exons[i];
354 
355 				int temp=0;
356 				if(e.intersects(index)){
357 					temp=(int)(e.b-index);
358 				}else if(e.b<index){
359 					break;
360 				}else{
361 					temp=e.length();
362 				}
363 				assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length();
364 				assert(temp>=0) : index+", "+e;
365 				off+=temp;
366 			}
367 
368 		}else{assert false : strand;}
369 
370 		return off;
371 	}
372 
toExonRelativeOffset(int index)373 	public int[] toExonRelativeOffset(int index){
374 
375 		int ex=0;
376 		int off=0;
377 
378 		if(strand==0){
379 
380 			//		System.out.println();
381 			for(Exon e : exons){
382 				//			System.out.print(e+" * ");
383 
384 				int temp=0;
385 				if(e.intersects(index)){
386 					temp=(int)(index-e.a);
387 				}else if(e.a>index){
388 					break;
389 				}else{
390 					ex++;
391 				}
392 				assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length();
393 				assert(temp>=0) : index+", "+e;
394 				off=temp;
395 			}
396 
397 		}else if(strand==1){
398 			for(int i=exons.length-1; i>=0; i--){
399 				Exon e=exons[i];
400 
401 				int temp=0;
402 				if(e.intersects(index)){
403 					temp=(int)(e.b-index);
404 				}else if(e.b<index){
405 					break;
406 				}else{
407 					ex++;
408 				}
409 				assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length();
410 				assert(temp>=0) : index+", "+e;
411 				off=temp;
412 			}
413 
414 		}else{assert false : strand;}
415 
416 //		if((index-143053138)>-3 && (index-143053138)<3){
417 //			assert(false) : ("\n\nLooking for "+index+" in\n"+this+
418 //					"\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n");
419 //		}
420 //
421 //		if((index-143053111)>-10 && (index-143053111)<10){
422 //			assert(false) : ("\n\nLooking for "+index+" in\n"+this+
423 //					"\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n");
424 //		}
425 
426 //		if(off==1 && exons[exons.length-1].b==143053111){
427 //			assert(false) : ("\n\nLooking for "+index+" in\n"+this+
428 //					"\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n");
429 //		}
430 
431 		//		System.out.println();
432 		return new int[] {ex, off};
433 	}
434 
435 
isHypothetical()436 	public boolean isHypothetical(){
437 		return isHypothetical(symbol);
438 	}
439 
440 
isHypothetical(String s)441 	public static boolean isHypothetical(String s){
442 		if(s==null){return false;}
443 		if(s.startsWith("C") && s.contains("orf")){return true;}
444 		if(s.length()>=4 && s.startsWith("LOC") && Tools.isDigit(s.charAt(3))){return true;}
445 		return false;
446 	}
447 
448 
isNormalGene()449 	public boolean isNormalGene(){
450 		return valid && !untranslated && !pseudo && !isHypothetical();
451 	}
452 
453 
intersectsTx(int point)454 	public boolean intersectsTx(int point){
455 		return point>=txStart && point<=txStop;
456 	}
intersectsTr(int point)457 	public boolean intersectsTr(int point){
458 		assert(!untranslated);
459 		return (untranslated ? false : point>=translationStart() && point<=translationStop());
460 	}
intersectsCode(int point)461 	public boolean intersectsCode(int point){
462 //		assert(!untranslated) : "point = "+point+"\ngene = "+this;
463 //		return (untranslated ? false : point>=codeStart && point<=codeEnd);
464 		return (untranslated ? intersectsTx(point) : point>=codeStart && point<=codeStop);
465 	}
intersectsExon(int point)466 	public boolean intersectsExon(int point){
467 		for(Exon e : exons){
468 			if(e.intersects(point)){return true;}
469 		}
470 		return false;
471 	}
472 
473 	/** Note that this skips code intersection checking for untranslated genes. */
intersectsCodeAndExon(int point)474 	public boolean intersectsCodeAndExon(int point){
475 		if(!untranslated && !intersectsCode(point)){return false;}
476 		for(Exon e : exons){
477 			if(e.intersects(point)){return true;}
478 		}
479 		return false;
480 	}
481 
482 
483 	/** Note that this skips code intersection checking for untranslated genes. */
intersectsCodeAndExon(int a, int b)484 	public boolean intersectsCodeAndExon(int a, int b){
485 		if(!untranslated && !intersectsCode(a, b)){return false;}
486 		for(Exon e : exons){
487 			if(e.intersects(a, b)){return true;}
488 		}
489 		return false;
490 	}
491 
492 	/** Note that this skips code intersection checking for untranslated genes. */
intersectsIntron(int a, int b)493 	public boolean intersectsIntron(int a, int b){
494 		if(exons==null || exons.length<2){return false;}
495 		if(!overlap(a, b, exons[0].a, exons[exons.length-1].b)){return false;}
496 		for(int i=1; i<exons.length; i++){
497 			Exon e1=exons[i-1];
498 			Exon e2=exons[i];
499 			assert(e1.b<e2.a) : "\n"+e1+"\n"+e2+"\n"+this+"\n";
500 
501 			assert(a<=b && e1.b+1<=e2.a-1) : "\n"+e1+"\n"+e2+"\n"+this+"\n";
502 
503 			if(overlap(a, b, e1.b+1, e2.a-1)){return true;}
504 		}
505 		return false;
506 	}
507 
508 	/** Note that this skips code intersection checking for untranslated genes. */
509 	public boolean isDeepIntronic(int a, int b, int distFromEnds){
510 		if(exons==null){return false;}
511 		for(int i=1; i<exons.length; i++){
512 			Exon e1=exons[i-1];
513 			Exon e2=exons[i];
514 			assert(e1.b<e2.a) : "\n"+e1+"\n"+e2+"\n"+this+"\n";
515 			if(a>=e1.b+distFromEnds && b<=e2.a-distFromEnds){return true;}
516 		}
517 		return false;
518 	}
519 
520 	public boolean intersectsSplice(int a, int b){
521 		assert(b>=a);
522 		if(exons==null || exons.length<2){return false;}
523 		if(b<txStart || a>txStop){return false;}
524 		for(Exon e : exons){
525 			if(e.a>=a && e.a<=b){return true;}
526 			if(e.b>=a && e.b<=b){return true;}
527 		}
528 		return false;
529 	}
530 
531 	public boolean intersectsNearby(int a, int b){
532 		return intersectsCodeAndExon(a-NEAR, b+NEAR);
533 	}
534 
535 	private static int closestToPoint(int a, int b, int point){
536 		int a2=(a>point ? a-point : point-a);
537 		int b2=(b>point ? b-point : point-b);
538 		return a2<b2 ? a : b;
539 	}
540 
541 	/**
542 	 * @param a
543 	 * @param b
544 	 * @return {
545 	 * 	distance,<br>
546 	 * 	nearest exon number (-1 means coding start or stop),<br>
547 	 * 	side (0 means start, 1 means stop),<br>
548 	 * 	position (1 means inside, 2 means outside, 3 means both),<br>
549 	 *  site coordinate
550 	 *  }
551 	 */
552 	public int[] nearestSpliceSite(int a, int b){
553 
554 		int bestDist=999999999;
555 		int nearestExon=-1;
556 		int side=-1;
557 		int position=0;
558 		int bestSite=-1;
559 
560 		boolean strictlyIntronic=this.isDeepIntronic(a, b, 1);
561 
562 		if(!strictlyIntronic){
563 			{
564 				int point=codeStart;
565 				int x=Exon.distToPoint(a, b, point);
566 				if(x<bestDist){
567 					bestDist=x;
568 					bestSite=point;
569 					nearestExon=-1;
570 					position=0;
571 					if(a<point){position|=2;}
572 					if(b>=point){position|=1;}
573 					side=(strand==Shared.PLUS ? 0 : 1);
574 					if(strand==Shared.PLUS){
575 						side=0;
576 					}else if(strand==Shared.MINUS){
577 						side=1;
578 					}
579 				}
580 
581 				point=codeStop;
582 				x=Exon.distToPoint(a, b, point);
583 				if(x<bestDist){
584 					bestDist=x;
585 					bestSite=point;
586 					nearestExon=-1;
587 					position=0;
588 					if(b>point){position|=2;}
589 					if(a<=point){position|=1;}
590 					side=(strand==Shared.PLUS ? 1 : 0);
591 				}
592 			}
593 		}
594 
595 		for(int i=0; i<exons.length; i++){
596 			Exon e=exons[i];
597 
598 			int point=e.a;
599 			int x=Exon.distToPoint(a, b, point);
600 			if(x<bestDist){
601 				bestDist=x;
602 				bestSite=point;
603 				nearestExon=i;
604 				side=(strand==Shared.PLUS ? 0 : 1);
605 				position=0;
606 				if(a<point){position|=2;}
607 				if(b>=point){position|=1;}
608 			}
609 
610 			point=e.b;
611 			x=Exon.distToPoint(a, b, point);
612 			if(x<bestDist){
613 				bestDist=x;
614 				bestSite=point;
615 				nearestExon=i;
616 				side=(strand==Shared.PLUS ? 1 : 0);
617 				position=0;
618 				if(b>point){position|=2;}
619 				if(a<=point){position|=1;}
620 			}
621 		}
622 
623 		if(nearestExon>=0 && strand==Shared.MINUS){
624 			nearestExon=exons.length-nearestExon-1;
625 		}
626 
627 		return new int[] {bestDist, nearestExon, side, position, bestSite};
628 	}
629 
630 
631 
632 	public boolean intersectsTx(int a, int b){
633 		assert(a<=b);
634 		return overlap(a, b, txStart, txStop);
635 	}
636 	public boolean intersectsTr(int a, int b){
637 		assert(a<=b);
638 		assert(!untranslated);
639 		return (untranslated ? false : overlap(a, b, translationStart(), translationStop()));
640 	}
641 	public boolean intersectsCode(int a, int b){
642 		assert(a<=b);
643 //		assert(!untranslated) : "a="+a+", b="+b+"\ngene = "+this;
644 //		return (untranslated ? false : overlap(a, b, codeStart, codeEnd));
645 		return (untranslated ? intersectsTx(a, b) : overlap(a, b, codeStart, codeStop));
646 	}
647 	public boolean intersectsExon(int a, int b){
648 //		if(!intersectsCode(a, b)){return false;}
649 		assert(a<=b);
650 		for(Exon e : exons){
651 			if(e.intersects(a, b)){return true;}
652 		}
653 		return false;
654 	}
655 	public boolean intersectsUTR(int a, int b){
656 		if(!intersectsTx(a,b)){return false;}
657 		if(untranslated){return true;}
658 		if(overlap(a, b, txStart, codeStart)){return true;}
659 		if(overlap(a, b, codeStop, txStop)){return true;}
660 		return false;
661 	}
662 	/** Downstream */
663 	public boolean intersectsUTR3(int a, int b){
664 		if(!intersectsTx(a,b)){return false;}
665 		if(untranslated){return false;}
666 		if(strand==Shared.MINUS){
667 			if(overlap(a, b, txStart, codeStart)){return true;}
668 		}else{
669 			if(overlap(a, b, codeStop, txStop)){return true;}
670 		}
671 		return false;
672 	}
673 	/** Upstream */
674 	public boolean intersectsUTR5(int a, int b){
675 		if(!intersectsTx(a,b)){return false;}
676 		if(untranslated){return false;}
677 		if(strand==Shared.PLUS){
678 			if(overlap(a, b, txStart, codeStart)){return true;}
679 		}else{
680 			if(overlap(a, b, codeStop, txStop)){return true;}
681 		}
682 		return false;
683 	}
684 
685 	private static boolean overlap(int a1, int b1, int a2, int b2){
686 		assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2;
687 		return a2<=b1 && b2>=a1;
688 	}
689 
690 	public static final String header(){
691 		return "#chrom\tsymbol\tgeneId\tmrnaAcc\tproteinAcc" +
692 				"\tstrand\tcodeStart\tcodeStop\ttxStart\ttxStop" +
693 				"\t(UNTRANSLATED?)\t(PSEUDOGENE?)\tstatus\tcompleteness\tsource" +
694 				"\t[exon0start-exon0stop, ...exonNstart-exonNstop]" +
695 				"\tfullName\tdescription";
696 	}
697 
698 //	public CharSequence toRefSeqFormat(){
699 //		return driver.ToRefGeneFormat.format(this);
700 //	}
701 
702 	@Override
703 	public String toString(){
704 
705 		StringBuilder sb=new StringBuilder(256);
706 
707 		sb.append(chromosome+"\t");
708 		sb.append(symbol+"\t");
709 		sb.append(id+"\t");
710 		sb.append(mrnaAcc+"\t");
711 		assert(proteinAcc==null || !proteinAcc.equals("null"));
712 		sb.append((proteinAcc==null ? "" : proteinAcc)+"\t");
713 		sb.append(Shared.strandCodes[strand]+"\t");
714 		sb.append(codeStart+"\t");
715 		sb.append(codeStop+"\t");
716 		sb.append(txStart+"\t");
717 		sb.append(txStop);
718 
719 		sb.append("\t"+(untranslated ? "UNTRANS" : ""));
720 		sb.append("\t"+(pseudo ? "PSEUDO" : ""));
721 
722 		sb.append("\t"+(status>=0 ? statusCodes[status] : ""));
723 		sb.append("\t"+(completeness>=0 ? completenessCodes[completeness] : ""));
724 		sb.append("\t"+(primarySource>=0 ? sourceCodes[primarySource] : ""));
725 
726 		sb.append("\t[");
727 		String comma="";
728 		for(int i=0; exons!=null && i<exons.length; i++){
729 			sb.append(comma+exons[i].a+"-"+exons[i].b);
730 			comma=", ";
731 		}
732 		sb.append("]");
733 
734 		assert(description==null || (!description.equals("null") && description.length()>0));
735 		sb.append("\t"+(description==null ? "" : description));
736 
737 		assert(fullDescription==null || (!fullDescription.equals("null") && fullDescription.length()>0));
738 		sb.append("\t"+(fullDescription==null ? "" : fullDescription));
739 
740 		String s=sb.toString();
741 		return Character.isWhitespace(s.charAt(0)) ? s : s.trim();
742 	}
743 
744 	public String toShortString(){
745 
746 		StringBuilder sb=new StringBuilder(256);
747 
748 		sb.append("chr"+chromosome+"\t");
749 		sb.append(symbol+"\t");
750 		sb.append(mrnaAcc+"\t");
751 		sb.append(Shared.strandCodes[strand]+"\t");
752 		sb.append("("+codeStart+" - "+codeStop+")");
753 		return sb.toString();
754 	}
755 
756 	@Override
757 	public int compareTo(Gene other){
758 		if(chromosome<other.chromosome){return -1;}
759 		if(chromosome>other.chromosome){return 1;}
760 
761 		if(txStart<other.txStart){return -1;}
762 		if(txStart>other.txStart){return 1;}
763 
764 		if(txStop<other.txStop){return -1;}
765 		if(txStop>other.txStop){return 1;}
766 
767 		if(codeStart<other.codeStart){return -1;}
768 		if(codeStart>other.codeStart){return 1;}
769 
770 		if(codeStop<other.codeStop){return -1;}
771 		if(codeStop>other.codeStop){return 1;}
772 
773 		if(exonLength<other.exonLength){return -1;}
774 		if(exonLength>other.exonLength){return 1;}
775 
776 		if(strand<other.strand){return -1;}
777 		if(strand>other.strand){return 1;}
778 
779 		if(id<other.id){return -1;}
780 		if(id>other.id){return 1;}
781 
782 		if(!symbol.equals(other.symbol)){return symbol.compareTo(other.symbol);}
783 		return mrnaAcc.compareTo(other.mrnaAcc);
784 	}
785 
786 	public boolean isIdenticalTo(Gene other){
787 		if(chromosome!=other.chromosome){return false;}
788 		if(strand!=other.strand){return false;}
789 		if(txStart!=other.txStart){return false;}
790 		if(txStop!=other.txStop){return false;}
791 		if(codeStart!=other.codeStart){return false;}
792 		if(codeStop!=other.codeStop){return false;}
793 		if(exonLength!=other.exonLength){return false;}
794 //		if(pseudo!=other.pseudo || untranslated!=other.untranslated){return false;}
795 		if(exons==null){
796 			if(other.exons!=null){return false;}
797 		}else{
798 			if(other.exons==null || (other.exons.length!=exons.length)){return false;}
799 			for(int i=0; i<exons.length; i++){
800 				Exon e1=exons[i], e2=other.exons[i];
801 				if(e1.a!=e2.a || e1.b!=e2.b){return false;}
802 				//assert(e1.equals(e2));
803 				//if(!e1.equals(e2)){return false;}
804 			}
805 		}
806 		return true;
807 	}
808 
809 	@Override
810 	public boolean equals(Object other){
811 		return equals((Gene)other);
812 	}
813 
814 	public boolean equals(Gene other){//TODO check this
815 		return this==other ? true : compareTo(other)==0;
816 	}
817 
818 	@Override
819 	public int hashCode(){
820 		int xor=txStart^(Integer.rotateLeft(txStop, 16));
821 		xor^=(chromosome<<20);
822 		xor^=strand;
823 		return xor;
824 	}
825 
826 	public int translationStart(){ //TODO Make into a field
827 		return (exons==null || exons.length==0) ? codeStart : exons[0].a;
828 	}
829 
830 	public int translationStop(){ //TODO Make into a field
831 		return (exons==null || exons.length==0) ? codeStop : exons[exons.length-1].b;
832 	}
833 
834 	public int codeStartStrandCompensated(){ //TODO Make into a field
835 		return strand==Shared.PLUS ? codeStart : codeStop;
836 	}
837 
838 	public int codeStopStrandCompensated(){ //TODO Make into a field
839 		return strand==Shared.PLUS ? codeStop : codeStart;
840 	}
841 
842 	public int translationStartStrandCompensated(){ //TODO Make into a field
843 		if(strand==Shared.PLUS){
844 			return (exons==null || exons.length==0) ? codeStart : exons[0].a;
845 		}else{
846 			return (exons==null || exons.length==0) ? codeStop : exons[exons.length-1].b;
847 		}
848 	}
849 
850 	public int translationStopStrandCompensated(){ //TODO Make into a field
851 		if(strand==Shared.PLUS){
852 			return (exons==null || exons.length==0) ? codeStop : exons[exons.length-1].b;
853 		}else{
854 			return (exons==null || exons.length==0) ? codeStart : exons[0].a;
855 		}
856 	}
857 
858 	public int exonStartStrandCompensated(int exNum){
859 		if(strand==Shared.PLUS){
860 			return (exons==null || exons.length==0) ? codeStart : exons[exNum].a;
861 		}else{
862 			return (exons==null || exons.length==0) ? codeStop : exons[exons.length-exNum-1].b;
863 		}
864 	}
865 
866 	public int exonStopStrandCompensated(int exNum){
867 		if(strand==Shared.PLUS){
868 			return (exons==null || exons.length==0) ? codeStop : exons[exNum].b;
869 		}else{
870 			return (exons==null || exons.length==0) ? codeStart : exons[exons.length-exNum-1].a;
871 		}
872 	}
873 
874 	public int findClosestExon(int a, int b) {
875 		if(exons==null || exons.length==0){return 0;}
876 		int best=Integer.MAX_VALUE;
877 		int exnum=-1;
878 		for(int i=0; i<exons.length; i++){
879 			Exon e=exons[i];
880 			int x=calcDistance(a, b, e.a, e.b);
881 			if(x<best){
882 				best=x;
883 				if(strand==Shared.PLUS){exnum=i;}
884 				else{exnum=exons.length-i-1;}
885 			}
886 		}
887 		assert(exnum>=0);
888 		return exnum;
889 	}
890 
891 
892 	/** Calculates the minimal distance between two ranges: (a1, b1) and (a2, b2). */
893 	public static final int calcDistance(int a1, int b1, int a2, int b2){
894 		assert(a1<=b1);
895 		assert(a2<=b2);
896 		int r;
897 		if(b1>=a2 && b2>=a1){r=0;}
898 		else if(a1>b2){r=a1-b2;}
899 		else{r=a2-b1;}
900 		assert(r>=0) : r;
901 		return r;
902 	}
903 
904 
905 	private static final int min(int x, int y){return x<y ? x : y;}
906 	private static final int max(int x, int y){return x>y ? x : y;}
907 
908 	/** Transcription start position */
909 	public final int txStart;
910 	/** Transcription end position */
911 	public final int txStop;
912 	/** Coding region start */
913 	public final int codeStart;
914 	/** Coding region end */
915 	public final int codeStop;
916 
917 	/** Length of transcribed area */
918 	public final int txLength;
919 
920 	/** Length of coding area */
921 	public final int codeLength;
922 
923 	/** Length of exons (summed) */
924 	public final int exonLength;
925 
926 	/** Length of exonic coding region */
927 	public final int exonCodeLength;
928 
929 	/** Number of amino acids (excluding stop codon) */
930 	public final int aaLength;
931 
932 	public final int utrLength5prime;
933 	public final int utrLength3prime;
934 
935 	/** Reference sequence chromosome or scaffold */
936 	public final byte chromosome;
937 	/** + or - for strand */
938 	public final byte strand;
939 	/** ? */
940 	public final byte cdsStartStat;
941 	/** ? */
942 	public final byte cdsEndStat;
943 
944 	public final boolean readCorrectly;
945 
946 	/** Array of exons used by this gene */
947 	public final Exon[] exons;
948 
949 	/** Exon frame {0,1,2}, or -1 if no frame for exon */
950 	public final byte[] exonFrames;
951 
952 	/** Name of gene (usually transcript_id from GTF) */
953 	public final String mrnaAcc;
954 
955 	/** Protein accession */
956 	public final String proteinAcc;
957 
958 	/** Alternate name (e.g. gene_id from GTF) */
959 	public final String symbol;
960 
961 	public final String description;
962 
963 	public final String fullDescription;
964 
965 	public final byte primarySource;
966 
967 	/* CCDS file format:
968 	 * chromosome	nc_accession	gene	gene_id	ccds_id	ccds_status	cds_strand	cds_from	cds_to	cds_locations	match_type */
969 
970 	/* CCDS format stuff */
971 
972 //	public final String nc_accession;
973 	public final byte status;
974 	public final byte completeness;
975 	public final int id;
976 
977 
978 	public final boolean untranslated;
979 	public final boolean pseudo;
980 	public final boolean valid;
981 
982 	public static final String[] sourceCodes={
983 		"seqGene", "knownGene", "refGene", "unionGene",
984 		"reserved1", "reserved2", "reserved3", "reserved4"
985 	};
986 
987 	/** Index with cdsStartStat and cdsEndStat */
988 	public static final String[] endStatCodes={"none", "unk", "incmpl", "cmpl"};
989 
990 	public static final String[] statusCodes={
991 		"Unknown","Reviewed","Validated","Provisional","Predicted","Inferred","Public"
992 
993 //		"Public", "Reviewed, update pending", "Reviewed, withdrawal pending",
994 //		"Withdrawn", "Withdrawn, inconsistent annotation",
995 //		"Under review, withdrawal", "Under review, update",
996 
997 	};
998 
999 	public static final String[] completenessCodes={
1000 		"Unknown","Complete5End","Complete3End","FullLength","IncompleteBothEnds","Incomplete5End","Incomplete3End","Partial"
1001 	};
1002 
1003 
1004 	/** Index with chromosome number */
1005 	public static final String[] chromCodes={"A", "1", "2", "3", "4", "5", "6",
1006 		"7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18",
1007 		"19", "20", "21", "22", "X", "Y", "M", "U"};
1008 
1009 	private static final int NEAR=Data.NEAR;
1010 
1011 	public static final byte STAT_UNKNOWN=0;
1012 	public static final byte STAT_REVIEWED=1;
1013 	public static final byte STAT_VALIDATED=2;
1014 	public static final byte STAT_PROVISIONAL=3;
1015 	public static final byte STAT_PREDICTED=4;
1016 	public static final byte STAT_INFERRED=5;
1017 	public static final byte STAT_PUBLIC=6;
1018 
1019 }
1020