1 package dna; 2 import java.io.Serializable; 3 4 import shared.Shared; 5 import shared.Tools; 6 7 8 public class Gene implements Comparable<Gene>, Serializable{ 9 10 // /** 11 // * 12 // */ 13 private static final long serialVersionUID = -1342555621377050981L; 14 15 Gene()16 public Gene(){ 17 chromosome=-1; 18 // nc_accession=null; 19 symbol=null; 20 proteinAcc=null; 21 id=-1; 22 mrnaAcc=null; 23 status=-1; 24 completeness=-1; 25 strand=-1; 26 codeStart=txStart=-1; 27 codeStop=txStop=-1; 28 exons=null; 29 cdsStartStat=-1; 30 cdsEndStat=-1; 31 exonFrames=null; 32 txLength=-1; 33 codeLength=-1; 34 exonLength=-1; 35 exonCodeLength=-1; 36 aaLength=-1; 37 utrLength5prime=-1; 38 utrLength3prime=-1; 39 readCorrectly=false; 40 untranslated=false; 41 pseudo=false; 42 description=null; 43 fullDescription=null; 44 valid=true; 45 primarySource=-1; 46 } 47 Gene(byte chrom, byte strand_, int txStart_, int txStop_, int cdStart_, int cdStop_, int gid, String name_, String trans_, String protTrans_, String status_, String completeness_, Exon[] exons_, boolean untran, boolean pseudo_, boolean valid_, String primarySource_, String descript_, String fullDescript_)48 public Gene(byte chrom, byte strand_, int txStart_, int txStop_, int cdStart_, int cdStop_, int gid, 49 String name_, String trans_, String protTrans_, String status_, String completeness_, 50 Exon[] exons_, boolean untran, boolean pseudo_, boolean valid_, 51 String primarySource_, String descript_, String fullDescript_){ 52 53 chromosome=chrom; 54 // nc_accession=null; 55 symbol=name_; 56 id=gid; 57 mrnaAcc=((trans_==null || trans_.length()<1 || trans_.equals("-")) ? null : trans_); 58 proteinAcc=((protTrans_==null || protTrans_.length()<1 || protTrans_.equals("-")) ? null : protTrans_); 59 60 primarySource=primarySource_==null ? -1 : (byte)Tools.find(primarySource_, sourceCodes); 61 description=descript_; 62 fullDescription=fullDescript_; 63 64 65 status=status_==null ? -1 : (byte)Tools.find(status_, statusCodes); 66 completeness=completeness_==null ? -1 : (byte)Tools.find(completeness_, completenessCodes); 67 strand=strand_; 68 69 exons=exons_; 70 71 txStart=txStart_; 72 txStop=txStop_; //Assuming pure 0-based numbering. 73 codeStart=cdStart_; 74 codeStop=cdStop_; //Assuming pure 0-based numbering. 75 76 assert(codeStart>=txStart) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc; 77 assert(codeStop<=txStop) : "("+txStart+", "+txStop+"), ("+codeStart+", "+codeStop+") for "+mrnaAcc; 78 79 80 // cdsStartStat=(byte)find("?", endStatCodes); 81 // cdsEndStat=(byte)find("?", endStatCodes); 82 cdsStartStat=-1; 83 cdsEndStat=-1; 84 85 exonFrames=null; 86 87 txLength=txStop-txStart+1; 88 codeLength=(codeStop==codeStart ? 0 : codeStop-codeStart+1); 89 90 untranslated=untran; 91 pseudo=pseudo_; 92 93 int eLen=0, ecLen=0, utr0=0, utr2=0; 94 95 if(exons!=null){ 96 97 for(Exon e : exons){ 98 99 utr0+=max(0, min(e.b, codeStart)-e.a); 100 utr2+=max(0, e.b-max(e.a, codeStop)); 101 102 int len=e.b-e.a+1; 103 eLen+=len; 104 len=(min(e.b, codeStop)-max(e.a, codeStart)); 105 len=max(0, len+1); 106 ecLen+=len; 107 } 108 } 109 110 111 exonLength=(eLen<2 ? 0 : eLen); 112 exonCodeLength=(codeLength<1 || exonLength<1 ? 0 : ecLen); 113 aaLength=exonCodeLength/3-1; 114 115 assert(exonLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n"; 116 assert(codeLength>=exonCodeLength) : exonLength+", "+codeLength+", "+exonCodeLength+"\n"+this+"\n"; 117 118 //assert(exonCodeLength%3 == 0); //This should be true with a correct database 119 120 if(strand==Shared.PLUS){ 121 utrLength5prime=untranslated ? 0 : utr0; 122 utrLength3prime=untranslated ? 0 : utr2; 123 }else{ 124 utrLength5prime=untranslated ? 0 : utr2; 125 utrLength3prime=untranslated ? 0 : utr0; 126 } 127 128 //System.err.println(name+", "+exonLength+", "+exonCodeLength+(exons==null ? "" : ", "+exons.length)); 129 130 readCorrectly=true; 131 valid=(readCorrectly && valid_); 132 } 133 134 merge(Gene g)135 public Gene merge(Gene g){ 136 137 assert((exons==null && g.exons==null) || 138 (exons!=null && g.exons!=null && exons.length==g.exons.length)); 139 // assert(exonLength==g.exonLength); 140 assert(Math.abs(exonLength-g.exonLength)<=8) : "\n\n"+this+"\n\n"+g+"\n\n"; 141 assert(strand==g.strand); 142 // assert(codeStart==g.codeStart); 143 // assert(codeStop==g.codeStop); 144 145 String Xsymbol=symbol; 146 String XproteinAcc=proteinAcc; 147 int Xid=id; 148 String XmrnaAcc=mrnaAcc; 149 int Xstatus=status; 150 int Xcompleteness=completeness; 151 int XcodeStart=codeStart; 152 int XcodeStop=codeStop; 153 int XtxStart=txStart; 154 int XtxStop=txStop; 155 int XcdsStartStat=cdsStartStat; 156 int XcdsEndStat=cdsEndStat; 157 byte[] XexonFrames=exonFrames; 158 int XtxLength=txLength; 159 int XcodeLength=codeLength; 160 int XexonLength=exonLength; 161 int XexonCodeLength=exonCodeLength; 162 int XaaLength=aaLength; 163 int XutrLength5prime=utrLength5prime; 164 int XutrLength3prime=utrLength3prime; 165 // boolean XreadCorrectly=readCorrectly; 166 boolean Xuntranslated=untranslated; 167 boolean Xpseudo=pseudo; 168 String Xdescription=description; 169 String XfullDescription=fullDescription; 170 boolean Xvalid=valid; 171 172 assert(untranslated || g.untranslated || g.codeStart>=txStart) : "\n"+this+"\n\n"+g; 173 assert(untranslated || g.untranslated || g.codeStop<=txStop) : "\n"+this+"\n\n"+g; 174 175 if(Xsymbol==null){Xsymbol=g.symbol;} 176 if(XproteinAcc==null){XproteinAcc=g.proteinAcc;} 177 if(Xid<0){Xid=g.id;} 178 if(XmrnaAcc==null){XmrnaAcc=g.mrnaAcc;} 179 if(Xstatus<0){Xstatus=g.status;} 180 if(Xcompleteness<0){Xcompleteness=g.completeness;} 181 182 183 if(XcodeStart==XcodeStop && g.codeStart<g.codeStop){ 184 assert(g.codeStart>=txStart); 185 assert(g.codeStop<=txStop); 186 XcodeStart=g.codeStart; 187 XcodeStop=g.codeStop; 188 } 189 190 //These two should never happen... 191 if(XtxStart<0){XtxStart=g.txStart;} 192 if(XtxStop<0){XtxStop=g.txStop;} 193 194 if(XcdsStartStat<0){XcdsStartStat=g.cdsStartStat;} 195 if(XcdsEndStat<0){XcdsEndStat=g.cdsEndStat;} 196 if(XexonFrames==null){XexonFrames=g.exonFrames;} 197 if(XtxLength<0){XtxLength=g.txLength;} 198 if(XcodeLength<0){XcodeLength=g.codeLength;} 199 if(XexonLength<0){XexonLength=g.exonLength;} 200 if(XexonCodeLength<0){XexonCodeLength=g.exonCodeLength;} 201 if(XaaLength<0){XaaLength=g.aaLength;} 202 if(XutrLength5prime<0){XutrLength5prime=g.utrLength5prime;} 203 if(XutrLength3prime<0){XutrLength3prime=g.utrLength3prime;} 204 if(Xdescription==null){Xdescription=g.description;} 205 if(XfullDescription==null){XfullDescription=g.fullDescription;} 206 207 // if(XreadCorrectly){} 208 // if(Xuntranslated){} 209 // if(Xpseudo){} 210 // if(Xvalid){} 211 212 //TODO Note that the readCorrectly field gets lost here 213 Gene out=new Gene(chromosome, strand, XtxStart, XtxStop, XcodeStart, XcodeStop, Xid, 214 symbol, XmrnaAcc, XproteinAcc, 215 Xstatus< 0 ? null : statusCodes[Xstatus], Xcompleteness<0 ? null : completenessCodes[Xcompleteness], 216 exons, Xuntranslated, Xpseudo, Xvalid, sourceCodes[primarySource], Xdescription, XfullDescription); 217 218 return out; 219 } 220 toStrand(String s)221 public static byte toStrand(String s){ 222 assert(s!=null && s.length()==1); 223 final char c=s.charAt(0); 224 if(c=='+'){return Shared.PLUS;} 225 else if(c=='-'){return Shared.MINUS;} 226 else if(c=='?'){return 2;} 227 throw new RuntimeException("Unknown strand: "+s); 228 } 229 toChromosome(final String s)230 public static int toChromosome(final String s){ 231 //// assert(false) : s; 232 // String s2=s; 233 // if(s2.endsWith("random")){s2="U";} 234 // if(s2.startsWith("chr")){s2=s2.substring(3);} 235 // if(s2.equals("MT")){s2="M";} 236 //// int loc=find2(s2.toUpperCase(), chromCodes); 237 // int loc=find3(s2.toUpperCase(), chromCodes); 238 // 239 // if(loc<0){ 240 // if(!Tools.isDigit(s2.charAt(0))){ 241 // loc=find3("U", chromCodes); 242 // }else{ 243 // try { 244 // loc=Integer.parseInt(s2); 245 // } catch (NumberFormatException e) { 246 // throw new RuntimeException(e); 247 // } 248 // assert(loc>=23 && loc<=26) : loc+", "+s; 249 // } 250 // } 251 // assert(loc>=0) : s; 252 // return loc; 253 254 String s2=s; 255 if(s2.startsWith("chr")){s2=s2.substring(3);} 256 int loc=Integer.parseInt(s2); 257 258 assert(loc>=0) : s; 259 return loc; 260 } 261 toBuild(final String s)262 public static int toBuild(final String s){ 263 String s2=s.toLowerCase(); 264 if(s2.startsWith("build")){s2=s2.substring(5);} 265 else if(s2.startsWith("b")){s2=s2.substring(1);} 266 else if(s2.startsWith("hg")){s2=s2.substring(1);} 267 268 if(s2.startsWith("=")){s2=s2.substring(1);} 269 270 assert(Tools.isDigit(s2.charAt(0))) : s; 271 272 return Integer.parseInt(s2); 273 } 274 fillExons(String eStarts, String eEnds, byte chr, byte str)275 private void fillExons(String eStarts, String eEnds, byte chr, byte str){ 276 String[] s1=eStarts.split(","); 277 String[] s2=eEnds.split(","); 278 279 int last=-1; 280 281 for(int i=0; i<s1.length; i++){ 282 int a=Integer.parseInt(s1[i]); 283 int b=Integer.parseInt(s2[i])-1; //Note the -1 for 0-based numbering. 284 assert(a>last) : eStarts; 285 last=a; 286 287 boolean cds=overlap(a, b, codeStart, codeStop); 288 boolean utr=(a<codeStart || b>codeStop); 289 290 Exon key=new Exon(a, b, chr, str, utr, cds); 291 Exon value=Exon.table.get(key); 292 if(value==null){ 293 value=key; 294 Exon.table.put(key, key); 295 } 296 exons[i]=value; 297 } 298 } 299 fillExonsCCDS(String estring, byte chr, byte str)300 private Exon[] fillExonsCCDS(String estring, byte chr, byte str){ 301 String[] intervals=estring.replace("[","").replace("]","").replace(" ","").split(","); 302 303 int last=-1; 304 305 Exon[] array=new Exon[intervals.length]; 306 307 for(int i=0; i<intervals.length; i++){ 308 String[] temp=intervals[i].split("-"); 309 int a=Integer.parseInt(temp[0]); 310 int b=Integer.parseInt(temp[1]); //Note the pure 0-based numbering. 311 assert(a>last) : estring; 312 last=a; 313 314 boolean cds=overlap(a, b, codeStart, codeStop); 315 boolean utr=(a<codeStart || b>codeStop); 316 317 Exon key=new Exon(a, b, chr, str, utr, cds); 318 Exon value=Exon.table.get(key); 319 if(value==null){ 320 value=key; 321 Exon.table.put(key, key); 322 } 323 array[i]=value; 324 } 325 return array; 326 } 327 toGeneRelativeOffset(int index)328 public int toGeneRelativeOffset(int index){ 329 330 int off=0; 331 332 if(strand==Shared.PLUS){ 333 334 // System.out.println(); 335 for(Exon e : exons){ 336 // System.out.print(e+" * "); 337 338 int temp=0; 339 if(e.intersects(index)){ 340 temp=(int)(index-e.a); 341 }else if(e.a>index){ 342 break; 343 }else{ 344 temp=e.length(); 345 } 346 assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length(); 347 assert(temp>=0) : index+", "+e; 348 off+=temp; 349 } 350 351 }else if(strand==Shared.MINUS){ 352 for(int i=exons.length-1; i>=0; i--){ 353 Exon e=exons[i]; 354 355 int temp=0; 356 if(e.intersects(index)){ 357 temp=(int)(e.b-index); 358 }else if(e.b<index){ 359 break; 360 }else{ 361 temp=e.length(); 362 } 363 assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length(); 364 assert(temp>=0) : index+", "+e; 365 off+=temp; 366 } 367 368 }else{assert false : strand;} 369 370 return off; 371 } 372 toExonRelativeOffset(int index)373 public int[] toExonRelativeOffset(int index){ 374 375 int ex=0; 376 int off=0; 377 378 if(strand==0){ 379 380 // System.out.println(); 381 for(Exon e : exons){ 382 // System.out.print(e+" * "); 383 384 int temp=0; 385 if(e.intersects(index)){ 386 temp=(int)(index-e.a); 387 }else if(e.a>index){ 388 break; 389 }else{ 390 ex++; 391 } 392 assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length(); 393 assert(temp>=0) : index+", "+e; 394 off=temp; 395 } 396 397 }else if(strand==1){ 398 for(int i=exons.length-1; i>=0; i--){ 399 Exon e=exons[i]; 400 401 int temp=0; 402 if(e.intersects(index)){ 403 temp=(int)(e.b-index); 404 }else if(e.b<index){ 405 break; 406 }else{ 407 ex++; 408 } 409 assert(temp<=e.length()) : index +" \t "+e+" \t "+temp+" \t "+e.length(); 410 assert(temp>=0) : index+", "+e; 411 off=temp; 412 } 413 414 }else{assert false : strand;} 415 416 // if((index-143053138)>-3 && (index-143053138)<3){ 417 // assert(false) : ("\n\nLooking for "+index+" in\n"+this+ 418 // "\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n"); 419 // } 420 // 421 // if((index-143053111)>-10 && (index-143053111)<10){ 422 // assert(false) : ("\n\nLooking for "+index+" in\n"+this+ 423 // "\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n"); 424 // } 425 426 // if(off==1 && exons[exons.length-1].b==143053111){ 427 // assert(false) : ("\n\nLooking for "+index+" in\n"+this+ 428 // "\n\nwith exons\n"+Arrays.toString(exons)+"\n\nResult: "+off+"\n\n"); 429 // } 430 431 // System.out.println(); 432 return new int[] {ex, off}; 433 } 434 435 isHypothetical()436 public boolean isHypothetical(){ 437 return isHypothetical(symbol); 438 } 439 440 isHypothetical(String s)441 public static boolean isHypothetical(String s){ 442 if(s==null){return false;} 443 if(s.startsWith("C") && s.contains("orf")){return true;} 444 if(s.length()>=4 && s.startsWith("LOC") && Tools.isDigit(s.charAt(3))){return true;} 445 return false; 446 } 447 448 isNormalGene()449 public boolean isNormalGene(){ 450 return valid && !untranslated && !pseudo && !isHypothetical(); 451 } 452 453 intersectsTx(int point)454 public boolean intersectsTx(int point){ 455 return point>=txStart && point<=txStop; 456 } intersectsTr(int point)457 public boolean intersectsTr(int point){ 458 assert(!untranslated); 459 return (untranslated ? false : point>=translationStart() && point<=translationStop()); 460 } intersectsCode(int point)461 public boolean intersectsCode(int point){ 462 // assert(!untranslated) : "point = "+point+"\ngene = "+this; 463 // return (untranslated ? false : point>=codeStart && point<=codeEnd); 464 return (untranslated ? intersectsTx(point) : point>=codeStart && point<=codeStop); 465 } intersectsExon(int point)466 public boolean intersectsExon(int point){ 467 for(Exon e : exons){ 468 if(e.intersects(point)){return true;} 469 } 470 return false; 471 } 472 473 /** Note that this skips code intersection checking for untranslated genes. */ intersectsCodeAndExon(int point)474 public boolean intersectsCodeAndExon(int point){ 475 if(!untranslated && !intersectsCode(point)){return false;} 476 for(Exon e : exons){ 477 if(e.intersects(point)){return true;} 478 } 479 return false; 480 } 481 482 483 /** Note that this skips code intersection checking for untranslated genes. */ intersectsCodeAndExon(int a, int b)484 public boolean intersectsCodeAndExon(int a, int b){ 485 if(!untranslated && !intersectsCode(a, b)){return false;} 486 for(Exon e : exons){ 487 if(e.intersects(a, b)){return true;} 488 } 489 return false; 490 } 491 492 /** Note that this skips code intersection checking for untranslated genes. */ intersectsIntron(int a, int b)493 public boolean intersectsIntron(int a, int b){ 494 if(exons==null || exons.length<2){return false;} 495 if(!overlap(a, b, exons[0].a, exons[exons.length-1].b)){return false;} 496 for(int i=1; i<exons.length; i++){ 497 Exon e1=exons[i-1]; 498 Exon e2=exons[i]; 499 assert(e1.b<e2.a) : "\n"+e1+"\n"+e2+"\n"+this+"\n"; 500 501 assert(a<=b && e1.b+1<=e2.a-1) : "\n"+e1+"\n"+e2+"\n"+this+"\n"; 502 503 if(overlap(a, b, e1.b+1, e2.a-1)){return true;} 504 } 505 return false; 506 } 507 508 /** Note that this skips code intersection checking for untranslated genes. */ 509 public boolean isDeepIntronic(int a, int b, int distFromEnds){ 510 if(exons==null){return false;} 511 for(int i=1; i<exons.length; i++){ 512 Exon e1=exons[i-1]; 513 Exon e2=exons[i]; 514 assert(e1.b<e2.a) : "\n"+e1+"\n"+e2+"\n"+this+"\n"; 515 if(a>=e1.b+distFromEnds && b<=e2.a-distFromEnds){return true;} 516 } 517 return false; 518 } 519 520 public boolean intersectsSplice(int a, int b){ 521 assert(b>=a); 522 if(exons==null || exons.length<2){return false;} 523 if(b<txStart || a>txStop){return false;} 524 for(Exon e : exons){ 525 if(e.a>=a && e.a<=b){return true;} 526 if(e.b>=a && e.b<=b){return true;} 527 } 528 return false; 529 } 530 531 public boolean intersectsNearby(int a, int b){ 532 return intersectsCodeAndExon(a-NEAR, b+NEAR); 533 } 534 535 private static int closestToPoint(int a, int b, int point){ 536 int a2=(a>point ? a-point : point-a); 537 int b2=(b>point ? b-point : point-b); 538 return a2<b2 ? a : b; 539 } 540 541 /** 542 * @param a 543 * @param b 544 * @return { 545 * distance,<br> 546 * nearest exon number (-1 means coding start or stop),<br> 547 * side (0 means start, 1 means stop),<br> 548 * position (1 means inside, 2 means outside, 3 means both),<br> 549 * site coordinate 550 * } 551 */ 552 public int[] nearestSpliceSite(int a, int b){ 553 554 int bestDist=999999999; 555 int nearestExon=-1; 556 int side=-1; 557 int position=0; 558 int bestSite=-1; 559 560 boolean strictlyIntronic=this.isDeepIntronic(a, b, 1); 561 562 if(!strictlyIntronic){ 563 { 564 int point=codeStart; 565 int x=Exon.distToPoint(a, b, point); 566 if(x<bestDist){ 567 bestDist=x; 568 bestSite=point; 569 nearestExon=-1; 570 position=0; 571 if(a<point){position|=2;} 572 if(b>=point){position|=1;} 573 side=(strand==Shared.PLUS ? 0 : 1); 574 if(strand==Shared.PLUS){ 575 side=0; 576 }else if(strand==Shared.MINUS){ 577 side=1; 578 } 579 } 580 581 point=codeStop; 582 x=Exon.distToPoint(a, b, point); 583 if(x<bestDist){ 584 bestDist=x; 585 bestSite=point; 586 nearestExon=-1; 587 position=0; 588 if(b>point){position|=2;} 589 if(a<=point){position|=1;} 590 side=(strand==Shared.PLUS ? 1 : 0); 591 } 592 } 593 } 594 595 for(int i=0; i<exons.length; i++){ 596 Exon e=exons[i]; 597 598 int point=e.a; 599 int x=Exon.distToPoint(a, b, point); 600 if(x<bestDist){ 601 bestDist=x; 602 bestSite=point; 603 nearestExon=i; 604 side=(strand==Shared.PLUS ? 0 : 1); 605 position=0; 606 if(a<point){position|=2;} 607 if(b>=point){position|=1;} 608 } 609 610 point=e.b; 611 x=Exon.distToPoint(a, b, point); 612 if(x<bestDist){ 613 bestDist=x; 614 bestSite=point; 615 nearestExon=i; 616 side=(strand==Shared.PLUS ? 1 : 0); 617 position=0; 618 if(b>point){position|=2;} 619 if(a<=point){position|=1;} 620 } 621 } 622 623 if(nearestExon>=0 && strand==Shared.MINUS){ 624 nearestExon=exons.length-nearestExon-1; 625 } 626 627 return new int[] {bestDist, nearestExon, side, position, bestSite}; 628 } 629 630 631 632 public boolean intersectsTx(int a, int b){ 633 assert(a<=b); 634 return overlap(a, b, txStart, txStop); 635 } 636 public boolean intersectsTr(int a, int b){ 637 assert(a<=b); 638 assert(!untranslated); 639 return (untranslated ? false : overlap(a, b, translationStart(), translationStop())); 640 } 641 public boolean intersectsCode(int a, int b){ 642 assert(a<=b); 643 // assert(!untranslated) : "a="+a+", b="+b+"\ngene = "+this; 644 // return (untranslated ? false : overlap(a, b, codeStart, codeEnd)); 645 return (untranslated ? intersectsTx(a, b) : overlap(a, b, codeStart, codeStop)); 646 } 647 public boolean intersectsExon(int a, int b){ 648 // if(!intersectsCode(a, b)){return false;} 649 assert(a<=b); 650 for(Exon e : exons){ 651 if(e.intersects(a, b)){return true;} 652 } 653 return false; 654 } 655 public boolean intersectsUTR(int a, int b){ 656 if(!intersectsTx(a,b)){return false;} 657 if(untranslated){return true;} 658 if(overlap(a, b, txStart, codeStart)){return true;} 659 if(overlap(a, b, codeStop, txStop)){return true;} 660 return false; 661 } 662 /** Downstream */ 663 public boolean intersectsUTR3(int a, int b){ 664 if(!intersectsTx(a,b)){return false;} 665 if(untranslated){return false;} 666 if(strand==Shared.MINUS){ 667 if(overlap(a, b, txStart, codeStart)){return true;} 668 }else{ 669 if(overlap(a, b, codeStop, txStop)){return true;} 670 } 671 return false; 672 } 673 /** Upstream */ 674 public boolean intersectsUTR5(int a, int b){ 675 if(!intersectsTx(a,b)){return false;} 676 if(untranslated){return false;} 677 if(strand==Shared.PLUS){ 678 if(overlap(a, b, txStart, codeStart)){return true;} 679 }else{ 680 if(overlap(a, b, codeStop, txStop)){return true;} 681 } 682 return false; 683 } 684 685 private static boolean overlap(int a1, int b1, int a2, int b2){ 686 assert(a1<=b1 && a2<=b2) : a1+", "+b1+", "+a2+", "+b2; 687 return a2<=b1 && b2>=a1; 688 } 689 690 public static final String header(){ 691 return "#chrom\tsymbol\tgeneId\tmrnaAcc\tproteinAcc" + 692 "\tstrand\tcodeStart\tcodeStop\ttxStart\ttxStop" + 693 "\t(UNTRANSLATED?)\t(PSEUDOGENE?)\tstatus\tcompleteness\tsource" + 694 "\t[exon0start-exon0stop, ...exonNstart-exonNstop]" + 695 "\tfullName\tdescription"; 696 } 697 698 // public CharSequence toRefSeqFormat(){ 699 // return driver.ToRefGeneFormat.format(this); 700 // } 701 702 @Override 703 public String toString(){ 704 705 StringBuilder sb=new StringBuilder(256); 706 707 sb.append(chromosome+"\t"); 708 sb.append(symbol+"\t"); 709 sb.append(id+"\t"); 710 sb.append(mrnaAcc+"\t"); 711 assert(proteinAcc==null || !proteinAcc.equals("null")); 712 sb.append((proteinAcc==null ? "" : proteinAcc)+"\t"); 713 sb.append(Shared.strandCodes[strand]+"\t"); 714 sb.append(codeStart+"\t"); 715 sb.append(codeStop+"\t"); 716 sb.append(txStart+"\t"); 717 sb.append(txStop); 718 719 sb.append("\t"+(untranslated ? "UNTRANS" : "")); 720 sb.append("\t"+(pseudo ? "PSEUDO" : "")); 721 722 sb.append("\t"+(status>=0 ? statusCodes[status] : "")); 723 sb.append("\t"+(completeness>=0 ? completenessCodes[completeness] : "")); 724 sb.append("\t"+(primarySource>=0 ? sourceCodes[primarySource] : "")); 725 726 sb.append("\t["); 727 String comma=""; 728 for(int i=0; exons!=null && i<exons.length; i++){ 729 sb.append(comma+exons[i].a+"-"+exons[i].b); 730 comma=", "; 731 } 732 sb.append("]"); 733 734 assert(description==null || (!description.equals("null") && description.length()>0)); 735 sb.append("\t"+(description==null ? "" : description)); 736 737 assert(fullDescription==null || (!fullDescription.equals("null") && fullDescription.length()>0)); 738 sb.append("\t"+(fullDescription==null ? "" : fullDescription)); 739 740 String s=sb.toString(); 741 return Character.isWhitespace(s.charAt(0)) ? s : s.trim(); 742 } 743 744 public String toShortString(){ 745 746 StringBuilder sb=new StringBuilder(256); 747 748 sb.append("chr"+chromosome+"\t"); 749 sb.append(symbol+"\t"); 750 sb.append(mrnaAcc+"\t"); 751 sb.append(Shared.strandCodes[strand]+"\t"); 752 sb.append("("+codeStart+" - "+codeStop+")"); 753 return sb.toString(); 754 } 755 756 @Override 757 public int compareTo(Gene other){ 758 if(chromosome<other.chromosome){return -1;} 759 if(chromosome>other.chromosome){return 1;} 760 761 if(txStart<other.txStart){return -1;} 762 if(txStart>other.txStart){return 1;} 763 764 if(txStop<other.txStop){return -1;} 765 if(txStop>other.txStop){return 1;} 766 767 if(codeStart<other.codeStart){return -1;} 768 if(codeStart>other.codeStart){return 1;} 769 770 if(codeStop<other.codeStop){return -1;} 771 if(codeStop>other.codeStop){return 1;} 772 773 if(exonLength<other.exonLength){return -1;} 774 if(exonLength>other.exonLength){return 1;} 775 776 if(strand<other.strand){return -1;} 777 if(strand>other.strand){return 1;} 778 779 if(id<other.id){return -1;} 780 if(id>other.id){return 1;} 781 782 if(!symbol.equals(other.symbol)){return symbol.compareTo(other.symbol);} 783 return mrnaAcc.compareTo(other.mrnaAcc); 784 } 785 786 public boolean isIdenticalTo(Gene other){ 787 if(chromosome!=other.chromosome){return false;} 788 if(strand!=other.strand){return false;} 789 if(txStart!=other.txStart){return false;} 790 if(txStop!=other.txStop){return false;} 791 if(codeStart!=other.codeStart){return false;} 792 if(codeStop!=other.codeStop){return false;} 793 if(exonLength!=other.exonLength){return false;} 794 // if(pseudo!=other.pseudo || untranslated!=other.untranslated){return false;} 795 if(exons==null){ 796 if(other.exons!=null){return false;} 797 }else{ 798 if(other.exons==null || (other.exons.length!=exons.length)){return false;} 799 for(int i=0; i<exons.length; i++){ 800 Exon e1=exons[i], e2=other.exons[i]; 801 if(e1.a!=e2.a || e1.b!=e2.b){return false;} 802 //assert(e1.equals(e2)); 803 //if(!e1.equals(e2)){return false;} 804 } 805 } 806 return true; 807 } 808 809 @Override 810 public boolean equals(Object other){ 811 return equals((Gene)other); 812 } 813 814 public boolean equals(Gene other){//TODO check this 815 return this==other ? true : compareTo(other)==0; 816 } 817 818 @Override 819 public int hashCode(){ 820 int xor=txStart^(Integer.rotateLeft(txStop, 16)); 821 xor^=(chromosome<<20); 822 xor^=strand; 823 return xor; 824 } 825 826 public int translationStart(){ //TODO Make into a field 827 return (exons==null || exons.length==0) ? codeStart : exons[0].a; 828 } 829 830 public int translationStop(){ //TODO Make into a field 831 return (exons==null || exons.length==0) ? codeStop : exons[exons.length-1].b; 832 } 833 834 public int codeStartStrandCompensated(){ //TODO Make into a field 835 return strand==Shared.PLUS ? codeStart : codeStop; 836 } 837 838 public int codeStopStrandCompensated(){ //TODO Make into a field 839 return strand==Shared.PLUS ? codeStop : codeStart; 840 } 841 842 public int translationStartStrandCompensated(){ //TODO Make into a field 843 if(strand==Shared.PLUS){ 844 return (exons==null || exons.length==0) ? codeStart : exons[0].a; 845 }else{ 846 return (exons==null || exons.length==0) ? codeStop : exons[exons.length-1].b; 847 } 848 } 849 850 public int translationStopStrandCompensated(){ //TODO Make into a field 851 if(strand==Shared.PLUS){ 852 return (exons==null || exons.length==0) ? codeStop : exons[exons.length-1].b; 853 }else{ 854 return (exons==null || exons.length==0) ? codeStart : exons[0].a; 855 } 856 } 857 858 public int exonStartStrandCompensated(int exNum){ 859 if(strand==Shared.PLUS){ 860 return (exons==null || exons.length==0) ? codeStart : exons[exNum].a; 861 }else{ 862 return (exons==null || exons.length==0) ? codeStop : exons[exons.length-exNum-1].b; 863 } 864 } 865 866 public int exonStopStrandCompensated(int exNum){ 867 if(strand==Shared.PLUS){ 868 return (exons==null || exons.length==0) ? codeStop : exons[exNum].b; 869 }else{ 870 return (exons==null || exons.length==0) ? codeStart : exons[exons.length-exNum-1].a; 871 } 872 } 873 874 public int findClosestExon(int a, int b) { 875 if(exons==null || exons.length==0){return 0;} 876 int best=Integer.MAX_VALUE; 877 int exnum=-1; 878 for(int i=0; i<exons.length; i++){ 879 Exon e=exons[i]; 880 int x=calcDistance(a, b, e.a, e.b); 881 if(x<best){ 882 best=x; 883 if(strand==Shared.PLUS){exnum=i;} 884 else{exnum=exons.length-i-1;} 885 } 886 } 887 assert(exnum>=0); 888 return exnum; 889 } 890 891 892 /** Calculates the minimal distance between two ranges: (a1, b1) and (a2, b2). */ 893 public static final int calcDistance(int a1, int b1, int a2, int b2){ 894 assert(a1<=b1); 895 assert(a2<=b2); 896 int r; 897 if(b1>=a2 && b2>=a1){r=0;} 898 else if(a1>b2){r=a1-b2;} 899 else{r=a2-b1;} 900 assert(r>=0) : r; 901 return r; 902 } 903 904 905 private static final int min(int x, int y){return x<y ? x : y;} 906 private static final int max(int x, int y){return x>y ? x : y;} 907 908 /** Transcription start position */ 909 public final int txStart; 910 /** Transcription end position */ 911 public final int txStop; 912 /** Coding region start */ 913 public final int codeStart; 914 /** Coding region end */ 915 public final int codeStop; 916 917 /** Length of transcribed area */ 918 public final int txLength; 919 920 /** Length of coding area */ 921 public final int codeLength; 922 923 /** Length of exons (summed) */ 924 public final int exonLength; 925 926 /** Length of exonic coding region */ 927 public final int exonCodeLength; 928 929 /** Number of amino acids (excluding stop codon) */ 930 public final int aaLength; 931 932 public final int utrLength5prime; 933 public final int utrLength3prime; 934 935 /** Reference sequence chromosome or scaffold */ 936 public final byte chromosome; 937 /** + or - for strand */ 938 public final byte strand; 939 /** ? */ 940 public final byte cdsStartStat; 941 /** ? */ 942 public final byte cdsEndStat; 943 944 public final boolean readCorrectly; 945 946 /** Array of exons used by this gene */ 947 public final Exon[] exons; 948 949 /** Exon frame {0,1,2}, or -1 if no frame for exon */ 950 public final byte[] exonFrames; 951 952 /** Name of gene (usually transcript_id from GTF) */ 953 public final String mrnaAcc; 954 955 /** Protein accession */ 956 public final String proteinAcc; 957 958 /** Alternate name (e.g. gene_id from GTF) */ 959 public final String symbol; 960 961 public final String description; 962 963 public final String fullDescription; 964 965 public final byte primarySource; 966 967 /* CCDS file format: 968 * chromosome nc_accession gene gene_id ccds_id ccds_status cds_strand cds_from cds_to cds_locations match_type */ 969 970 /* CCDS format stuff */ 971 972 // public final String nc_accession; 973 public final byte status; 974 public final byte completeness; 975 public final int id; 976 977 978 public final boolean untranslated; 979 public final boolean pseudo; 980 public final boolean valid; 981 982 public static final String[] sourceCodes={ 983 "seqGene", "knownGene", "refGene", "unionGene", 984 "reserved1", "reserved2", "reserved3", "reserved4" 985 }; 986 987 /** Index with cdsStartStat and cdsEndStat */ 988 public static final String[] endStatCodes={"none", "unk", "incmpl", "cmpl"}; 989 990 public static final String[] statusCodes={ 991 "Unknown","Reviewed","Validated","Provisional","Predicted","Inferred","Public" 992 993 // "Public", "Reviewed, update pending", "Reviewed, withdrawal pending", 994 // "Withdrawn", "Withdrawn, inconsistent annotation", 995 // "Under review, withdrawal", "Under review, update", 996 997 }; 998 999 public static final String[] completenessCodes={ 1000 "Unknown","Complete5End","Complete3End","FullLength","IncompleteBothEnds","Incomplete5End","Incomplete3End","Partial" 1001 }; 1002 1003 1004 /** Index with chromosome number */ 1005 public static final String[] chromCodes={"A", "1", "2", "3", "4", "5", "6", 1006 "7", "8", "9", "10", "11", "12", "13", "14", "15", "16", "17", "18", 1007 "19", "20", "21", "22", "X", "Y", "M", "U"}; 1008 1009 private static final int NEAR=Data.NEAR; 1010 1011 public static final byte STAT_UNKNOWN=0; 1012 public static final byte STAT_REVIEWED=1; 1013 public static final byte STAT_VALIDATED=2; 1014 public static final byte STAT_PROVISIONAL=3; 1015 public static final byte STAT_PREDICTED=4; 1016 public static final byte STAT_INFERRED=5; 1017 public static final byte STAT_PUBLIC=6; 1018 1019 } 1020