1 2%{ 3#include "dyna.h" 4 5 6%} 7 8 9%{ 10#include "geneutil.h" 11 12 13%func 14Makes a new GenomicRegion with the genes 15predicted from this AlnBlock 16 17Really a wrapper around the add_Genes_to_GenomicRegion_GeneWise 18and other functionality 19%arg 20gen genomic sequence to use 21pseudo If true, predicts frameshifted genes as pseudogenes 22alb genewise alignment to predict genes from 23return a newly allocated structure 24%% 25GenomicRegion * new_GenomicRegion_from_GeneWise(Genomic * gen,boolean pseudo,AlnBlock * alb) 26{ 27 GenomicRegion * out; 28 29 out = new_GenomicRegion(gen); 30 31 add_Genes_to_GenomicRegion_new(out,alb); 32 33 return out; 34} 35 36%func 37Potential an Alnblock may have more 38than one gene due to looping models 39 40This adds all the genes to gr 41 42%arg 43gr genomic region to add genes to 44org_start start point of the dna to which the alb was made from 45org_end end point of the dna to which the alb was made from 46alb logical label alignment 47pseudo If true, frameshifted genes are predicted as pseudo genes 48root the second argument to make_name 49make_name fN a pointer to a function to actually make the name of the gene 50%% 51int add_Genes_to_GenomicRegion_GeneWise(GenomicRegion * gr,int org_start,int org_end,AlnBlock * alb,char * root,boolean pseudo,char * (*make_name)(Wise2_Genomic *,char *,int ,Wise2_Gene * )) 52{ 53 int count = 0; 54 Gene * gene; 55 AlnColumn * alc; 56 57 alc = alb->start; 58 59 while( (gene = Gene_from_AlnColumn_GeneWise(alc,org_start,org_end,pseudo,&alc)) != NULL ) { 60 if( make_name != NULL ) { 61 gene->name = (*make_name)(gr->genomic,root,gr->len,gene); 62 } 63 if( root != NULL ) { 64 gene->seqname = stringalloc(root); 65 } 66 67 add_Gene_to_GenomicRegion(gr,gene); 68 count++; 69 if( alc == NULL ) 70 break; 71 } 72 73 return count; 74} 75 76 77%func 78Adds genes using the new gene function 79%% 80int add_Genes_to_GenomicRegion_new(GenomicRegion * gr,AlnBlock * alb) 81{ 82 int count = 0; 83 Gene * out; 84 Transcript * tr; 85 Transcript * new_tr; 86 Translation * ts; 87 Exon * exon; 88 89 AlnBlock * coll; 90 AlnColumn * alc; 91 AlnColumn * exon_start; 92 AlnColumn * intron_start; 93 94 int trace = 1; 95 int exon_start_coord; 96 int is_reversed = 0; 97 int temp; 98 int i; 99 100 coll = collapsed_AlnBlock(alb,1); 101 102 103 104 /* find start */ 105 for(alc = coll->start;alc != NULL;alc = alc->next ) { 106 if( strstr(alc->alu[1]->text_label,"CODON") != NULL ) { 107 108 if( trace ) 109 fprintf(stderr,"Got %s as first codon\n",alc->alu[1]->text_label); 110 111 /* got a gene */ 112 out = Gene_alloc_len(1); 113 add_Gene_to_GenomicRegion(gr,out); 114 115 tr = Transcript_alloc_std(); 116 /* don't add transcript yet, because reversing it requires arsing around with exons */ 117 118 if( strstr(alc->alu[1]->text_label,"REV") != NULL ) { 119 is_reversed = 1; 120 } else { 121 is_reversed = 0; 122 } 123 exon_start_coord = alc->alu[1]->start+1; 124 out->start = exon_start_coord; 125 /* first exon is different as there is no 3' splice site */ 126 127 exon_start = alc; 128 alc = alc->next; 129 if( alc == NULL ) { 130 warn("weird death in gene parsing"); 131 return 1; 132 } 133 134 /* handle single exon genes */ 135 if( strstr(alc->alu[1]->text_label,"SS") == NULL ) { 136 /* single exon gene */ 137 138 exon = Exon_alloc_std(); 139 exon->start = 0; 140 exon->end = alc->alu[1]->start+3 - exon_start->alu[1]->start; 141 if( trace ) 142 fprintf(stderr,"Single exon gene %s from %d to %d\n",alc->alu[1]->text_label,exon->start,exon->end); 143 144 if( is_reversed == 0 ) { 145 out->start = exon_start->alu[1]->start+1; 146 out->end = alc->alu[1]->start+4; 147 } else { 148 out->end = exon_start->alu[1]->start; 149 out->start = alc->alu[1]->start+3; 150 } 151 152 add_ex_Transcript(tr,exon); 153 ts = Translation_alloc(); 154 ts->start = 0; 155 ts->end = length_Transcript(tr); 156 ts->parent = tr; 157 add_Transcript(tr,ts); 158 tr->parent = out; 159 add_Gene(out,tr); 160 161 continue; 162 } 163 164 /* handle first exon */ 165 exon = Exon_alloc_std(); 166 add_ex_Transcript(tr,exon); 167 exon->start = exon_start->alu[1]->start+1 - exon_start_coord; 168 169 if( strstr(alc->alu[1]->text_label,"PHASE_0") != NULL) { 170 exon->end = alc->alu[1]->start +1 - out->start; 171 } else if ( strstr(alc->alu[1]->text_label,"PHASE_1") != NULL) { 172 exon->end = alc->alu[1]->start +2 - out->start; 173 } else if ( strstr(alc->alu[1]->text_label,"PHASE_2") != NULL) { 174 exon->end = alc->alu[1]->start +3 - out->start; 175 } 176 177 178 /* loop over internal exons. Alc is current on leftmost splice site */ 179 for( ;; ) { 180 if( trace ) 181 fprintf(stderr,"internal exon - this should be leftmost splice %s\n",alc->alu[1]->text_label); 182 183 if( strstr(alc->next->alu[1]->text_label,"SS") != NULL ) { 184 warn("Have a no-base intron. Conceptually possible, but highly unlikely. Probably misparamterisation somewhere..."); 185 } else { 186 alc = alc->next; 187 if( trace ) 188 fprintf(stderr,"internal exon - this should be INTRON %s\n",alc->alu[1]->text_label); 189 } 190 191 192 if( alc == NULL ) { 193 warn("Weird death inside Intron of gene parsing"); 194 return 1; 195 } 196 alc = alc->next; 197 198 if( trace ) 199 fprintf(stderr,"internal exon - this should be rightmost splice %s\n",alc->alu[1]->text_label); 200 201 202 if( alc == NULL ) { 203 warn("Weird death inside Intron of gene parsing"); 204 return 1; 205 } 206 207 if( strstr(alc->alu[1]->text_label,"SS") == NULL ) { 208 warn("At end of intron , non splice label %s",alc->alu[1]->text_label); 209 return 1; 210 } 211 212 exon_start = alc; 213 214 alc = alc->next; 215 216 if( trace ) 217 fprintf(stderr,"internal exon - this should be CODON %s\n",alc->alu[1]->text_label); 218 219 if( alc == NULL ) { 220 warn("Weird death inside Intron of gene parsing"); 221 return 1; 222 } 223 alc = alc->next; 224 if( alc == NULL ) { 225 warn("Weird death inside Intron of gene parsing"); 226 return 1; 227 } 228 229 if( trace ) 230 fprintf(stderr,"internal exon - this should be 5SS %s\n",alc->alu[1]->text_label); 231 232 /* should have a 3'SS whatever */ 233 exon = Exon_alloc_std(); 234 add_ex_Transcript(tr,exon); 235 236 if( strstr(exon_start->alu[1]->text_label,"PHASE_0") != NULL) { 237 exon->start = exon_start->alu[1]->start +4 - out->start; 238 } else if ( strstr(exon_start->alu[1]->text_label,"PHASE_1") == NULL) { 239 exon->start = exon_start->alu[1]->start +4 - out->start; 240 } else if ( strstr(exon_start->alu[1]->text_label,"PHASE_2") == NULL) { 241 exon->start = exon_start->alu[1]->start +4 - out->start; 242 } 243 244 245 if( strstr(alc->alu[1]->text_label,"SS") == NULL ) { 246 fprintf(stderr,"Breaking with no 5'SS on %s with %d\n",alc->alu[1]->text_label,alc->alu[1]->start); 247 exon->end = alc->alu[1]->start +1 - out->start; 248 break; 249 } else { 250 if( strstr(alc->alu[1]->text_label,"PHASE_0") != NULL) { 251 exon->end = alc->alu[1]->start +1 - out->start; 252 } else if ( strstr(alc->alu[1]->text_label,"PHASE_1") != NULL) { 253 exon->end = alc->alu[1]->start +2 - out->start; 254 } else if ( strstr(alc->alu[1]->text_label,"PHASE_2") != NULL) { 255 exon->end = alc->alu[1]->start +3 - out->start; 256 } 257 258 } 259 /* alc is left on the 5'SS for the next exon */ 260 } /* end of for (;;) over internal exons*/ 261 262 /* end of a gene. exon is the last exon */ 263 out->end = exon->end + out->start; 264 265 ts = Translation_alloc(); 266 ts->start = 0; 267 ts->end = length_Transcript(tr); 268 if( ts->end % 3 != 0 ) { 269 warn("Transcript is not mod 3 size. It is %d doh!",ts->end); 270 } 271 272 if( is_reversed == 1 ) { 273 if( trace ) 274 fprintf(stderr,"Reversing gene %d to %d\n",out->start,out->end); 275 temp = out->end; 276 temp = temp; 277 out->end = out->start; 278 out->start = temp; 279 280 /* reversed genes have an off by one convention */ 281 out->end -= 1; 282 out->start -= 1; 283 284 /* now have to reverse transcript. doh! */ 285 new_tr = Transcript_alloc_std(); 286 temp = out->start - out->end; 287 for(i=tr->ex_len-1;i >= 0;i-- ) { 288 exon = Exon_alloc_std(); 289 exon->start = temp - tr->exon[i]->end; 290 exon->end = temp - tr->exon[i]->start; 291 fprintf(stderr,"Adding exon %d %d which used to be %d %d\n",exon->start,exon->end,tr->exon[i]->start,tr->exon[i]->end); 292 add_ex_Transcript(new_tr,exon); 293 } 294 free_Transcript(tr); 295 tr = new_tr; 296 } 297 298 add_Gene(out,tr); 299 300 ts->parent = tr; 301 add_Transcript(tr,ts); 302 tr->parent = out; 303 304 } else { 305 if( trace ) 306 fprintf(stderr,"Skipping %s before codon\n",alc->alu[1]->text_label); 307 /* do nothing if not a codon - march on */ 308 } 309 310 } 311 312 /* free_AlnBlock(coll);*/ 313 314 return 1; 315} 316 317 318%func 319helper function for new system 320%% 321boolean is_exon_AlnColumn_new(AlnColumn * alc) 322{ 323 if( strstr(alc->alu[1]->text_label,"CODON") != NULL ) { 324 return TRUE; 325 } else { 326 return FALSE; 327 } 328 329} 330 331 332%func 333helper function for the new system 334%% 335boolean is_splice_site_AlnColumn(AlnColumn * alc,int * type,int * phase) 336{ 337 338 if( strstr(alc->alu[1]->text_label,"SS") != NULL ) { 339 340 if( strstr(alc->alu[1]->text_label,"5SS") != NULL ) { 341 *type = 5; 342 } else { 343 *type = 3; 344 } 345 346 if( strstr(alc->alu[1]->text_label,"0") != NULL ) { 347 *phase = 0; 348 } else if ( strstr(alc->alu[1]->text_label,"1") != NULL ) { 349 *phase = 1; 350 } else if ( strstr(alc->alu[1]->text_label,"2") != NULL ) { 351 *phase = 2; 352 } 353 return TRUE; 354 } else { 355 return FALSE; 356 } 357} 358 359%func 360A new hope for building genes 361%% 362Gene * Gene_from_AlnColumn_new(AlnColumn * alc,AlnColumn ** end) 363{ 364 Gene * out; 365 Transcript * tr; 366 Translation * ts; 367 Exon * ex; 368 369 int exon_start; 370 int exon_end; 371 int phase_start; 372 int phase_end; 373 374 int is_reversed; 375 int is_3ss; 376 377 int phase; 378 int type; 379 int tmp; 380 381 AlnColumn * prev; 382 383 boolean end_gene = 0; 384 385 while(alc != NULL && is_exon_AlnColumn_new(alc) == FALSE ) 386 alc = alc->next; 387 388 if( alc == NULL ) { 389 *end = NULL; 390 return NULL; 391 } 392 393 if( strstr(alc->alu[1]->text_label,"CODON") == NULL ) { 394 warn("Bad news... exited from random columns, but not in a codon column, in a %s",alc->alu[1]->text_label); 395 * end = NULL; 396 return NULL; 397 } 398 399 /* we should have either a forward or reversed gene */ 400 401 if( strstr(alc->alu[1]->text_label,"REV") != NULL ) { 402 is_reversed = 1; 403 } else { 404 is_reversed = 0; 405 } 406 407 out = Gene_alloc_len(1); 408 tr = Transcript_alloc_std(); 409 add_Gene(out,tr); 410 411 out->start = alc->alu[1]->start +1; 412 413 phase_start = 0; 414 exon_start = 0; /* by definition, the first exon is at the start*/ 415 is_3ss = 0; 416 417 for(;;) { 418 419 if( strstr(alc->alu[1]->text_label,"GENE_EXIT") != NULL ) { 420 break; 421 } 422 423 exon_start = alc->alu[1]->start+1 - out->start; 424 425 if( is_3ss ) { 426 if( phase_start == 0 ) { 427 exon_start -= 3; 428 } else if ( phase_start == 1 ) { 429 exon_start -= 2; 430 } else if ( phase_start == 2 ) { 431 exon_start -= 1; 432 } 433 } 434 435 /* this is at the start of exon */ 436 /* go to the end of the exon */ 437 for(; alc != NULL && is_exon_AlnColumn_new(alc);alc=alc->next) { 438 fprintf(stdout,"Exonifying past %d %s\n",alc->alu[1]->start,alc->alu[1]->text_label); 439 prev = alc; 440 } 441 442 /* irregardless of the fate of this exon, we now can put it 443 * in, as if it was phase0-phase0 */ 444 445 446 exon_end = prev->alu[1]->end +1 - out->start; 447 448 449 ex = Exon_alloc_std(); 450 fprintf(stdout,"Adding exon with %d-%d\n",ex->start,ex->end); 451 add_ex_Transcript(tr,ex); 452 453 ex->start = exon_start; 454 ex->end = exon_end; 455 456 if( alc != NULL && is_splice_site_AlnColumn(alc,&type,&phase) ) { 457 phase_end = phase; 458 } else { 459 is_3ss = 0; 460 phase_end = 0; 461 end_gene = 1; 462 } 463 464 /* jigging the splice sites for the phases */ 465 466 if( phase_end == 0 ) { 467 ex->end = ex->end; 468 } else if( phase_end == 1 ) { 469 ex->end = ex->end +1; 470 } else if( phase_end == 2 ) { 471 ex->end = ex->end +2; 472 } 473 474 ex->phase = phase_start; 475 476 if( end_gene == 1 ) { 477 break; 478 } 479 480 for(; alc != NULL && !is_exon_AlnColumn_new(alc);alc=alc->next) { 481/* fprintf(stdout,"Moving past %d %s\n",alc->alu[1]->start,alc->alu[1]->text_label);*/ 482 if( is_splice_site_AlnColumn(alc,&type,&phase) ) { 483 is_3ss = 1; 484 phase_start = phase; 485 } 486 if( strstr(alc->alu[1]->text_label,"GENE_EXIT") != NULL ) { 487/* fprintf(stdout,"Exiting %d %s\n",alc->alu[1]->start,alc->alu[1]->text_label); */ 488 end_gene = 1; 489 break; 490 } 491 492 } 493 494 if( end_gene == 1 ) { 495 break; 496 } 497 498 if( alc == NULL ) { 499 break; 500 } 501 } 502 503 *end = alc; 504 out->end = ex->end + out->start; 505 506 if( is_reversed == 1 ) { 507 tmp = out->start; 508 out->start = out->end; 509 out->end = tmp; 510 } 511 512 513 ts = Translation_alloc(); 514 ts->start = 0; 515 ts->end = length_Transcript(tr); 516 ts->parent = tr; 517 add_Transcript(tr,ts); 518 519 tr->parent = out; 520 521 522 fprintf(stdout,"returning gene %d %d\n",out->start,out->end); 523 524 return out; 525} 526 527%func 528A wrap for making a gene structure from 529an AlnBlock derived from one of the genewise 530methods 531%arg 532alc Alignment column in an AlnBlock produced by genewise 533org_start offset that the genewise alignment was to the coordinate system 534org_end emd that the genewise alignment was to the coordinate system 535end w pointer to a AlnColumn * to say when it has finished with this gene 536%% 537Gene * Gene_from_AlnColumn_GeneWise(AlnColumn * alc,int org_start,int org_end,boolean predict_pseudo_for_frameshift,AlnColumn ** end) 538{ 539 Gene * out; 540 Transcript * tr; 541 Translation * ts; 542 Exon * ex; 543 AlnColumn * prev; 544 SupportingFeature * sf; 545 int score = 0; 546 int dosf = 0; 547 int phase = 0; 548 int frame_break = 0; 549 550 551 while(alc != NULL && is_random_AlnColumn_genewise(alc) == TRUE ) 552 alc = alc->next; 553 554 while (alc != NULL && strcmp(alc->alu[1]->text_label,"CODON") != 0 ) 555 alc = alc->next; 556 557 if( alc == NULL) 558 return NULL; 559 560 out = Gene_alloc_len(1); 561 tr = Transcript_alloc_std(); 562 add_Gene(out,tr); 563 564 out->start = alc->alu[1]->start +1; 565 566 score += alc->alu[0]->score[0]; 567 568 for(;;) { 569 570/* fprintf(stderr,"Top loop - alc at %s %s\n",alc->alu[1]->text_label,alc->alu[0]->text_label); */ 571 572 /* fprintf(stderr,"2 Score is %.2f %s %d %.2f\n",Score2Bits(score),alc->alu[1]->text_label,alc->alu[0]->score[0],Score2Bits(alc->alu[0]->score[0]));*/ 573 ex = Exon_alloc_std(); 574 add_ex_Transcript(tr,ex); 575 576 /* 577 * this is always the start of an exon 578 */ 579 580 dosf = 0; 581 582 if( strcmp(alc->alu[1]->text_label,"CODON") == 0 ) { 583 ex->start = alc->alu[1]->start+1 - out->start; /* coordinated in alignment coords */ 584 dosf = 1; 585 phase = 0; 586 } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_0") == 0 ) { 587 ex->start = alc->alu[1]->start +4 - out->start; 588 dosf = 1; 589 phase = 0; 590 } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_1") == 0 ) { 591 ex->start = alc->alu[1]->start +4 - out->start; 592 phase = 1; 593 } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_2") == 0 ) { 594 ex->start = alc->alu[1]->start +4 - out->start; 595 phase = 2; 596 } else { 597 ex->start = alc->alu[1]->start +1 - out->start; /* coordinated in alignment coords */ 598 phase = -1; 599 } 600 601 ex->phase = phase; 602 603 /* 604 * Exons can start in INSERTs (yuk). In which case we don't 605 * make a supporting feature 606 */ 607 608 609 if( dosf == 1 && strstartcmp(alc->alu[0]->text_label,"INSERT") != 0 ) { 610 sf = SupportingFeature_alloc(); 611 /* we fill in start and end from the exon at the moment. Not pretty */ 612 sf->hstart = alc->alu[0]->start+1; 613 sf->hstrand = 1; /* currently only got proteins. Thank the lord! */ 614 sf->start = ex->start; 615 } else { 616 sf = NULL; /* make sure we don't generate a sf here */ 617 } 618 619 620 621 for(prev=alc,alc=alc->next;alc != NULL; ) { 622 /* fprintf(stderr,"Exon loop - alc at %s %s %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,sf); */ 623 624 score += alc->alu[0]->score[0]; 625 /* fprintf(stderr,"1 Score is %.2f %s\n",Score2Bits(score),alc->alu[1]->text_label); */ 626 if( is_frameshift_AlnColumn_genewise(alc) == TRUE && predict_pseudo_for_frameshift == TRUE ) { 627 score += alc->alu[0]->score[0]; 628 fprintf(stderr,"Score is %.2f\n",Score2Bits(score)); 629 out->ispseudo = TRUE; 630 alc = alc->next; 631 continue; 632 } 633 634 if( is_random_AlnColumn_genewise(alc) == TRUE) 635 break; 636 637 if( strcmp(alc->alu[1]->text_label,"CODON") != 0 ) { 638 if( strstartcmp(alc->alu[0]->text_label,"DELETE") == 0 ) { 639 /* must add sf and start a new one */ 640 /*fprintf(stderr,"Looking at alc at %s %s %d %d %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,prev,out,sf);*/ 641 642 if( sf != NULL ) { 643 sf->end = prev->alu[1]->end+1 - out->start; 644 sf->hend = prev->alu[0]->end+1; 645 add_Exon(ex,sf); 646 sf = NULL; 647 } 648 649 /* 650 * go the end of this delete run, which are residues in the query with no 651 * target info 652 */ 653 while( alc->next != NULL && strstartcmp(alc->next->alu[0]->text_label,"DELETE") == 0 ) { 654 alc = alc->next; 655 } 656 if( alc->next != NULL && strcmp(alc->next->alu[1]->text_label,"CODON") == 0 ) { 657 /* the next position is the start of the new alignment */ 658 sf = SupportingFeature_alloc(); 659 sf->hstart = alc->next->alu[0]->start+1; 660 sf->start = alc->next->alu[1]->start+1 - out->start; 661 } else { 662 sf = NULL; 663 } 664 } else { 665 break; 666 } 667 668 } else { 669 /* it is a codon match, but it could be an insert */ 670 if( strstartcmp(alc->alu[0]->text_label,"INSERT") == 0) { 671 /* break at this point, add this supporting feature */ 672 if( sf != NULL ) { 673 sf->end = prev->alu[1]->end+1 - out->start; 674 sf->hend = prev->alu[0]->end+1; 675 add_Exon(ex,sf); 676 sf = NULL; 677 } 678 679 frame_break = 0; 680 681 /* go to the end of this insert run, watching for frameshifts */ 682 while( alc->next != NULL && strstartcmp(alc->next->alu[0]->text_label,"INSERT") == 0 ) { 683 684 if( is_frameshift_AlnColumn_genewise(alc->next) == TRUE || is_random_AlnColumn_genewise(alc->next) == TRUE) { 685 686 if( is_frameshift_AlnColumn_genewise(alc->next) == TRUE && predict_pseudo_for_frameshift == TRUE ) { 687 out->ispseudo = TRUE; 688 alc = alc->next; 689 continue; 690 } else { 691 alc = alc->next; 692 frame_break = 1; 693 break; 694 } 695 } 696 alc = alc->next; 697 } 698 if( frame_break == 1 ) { 699 break; /* out of this gene */ 700 } 701 702 if( alc->next != NULL && strcmp(alc->next->alu[1]->text_label,"CODON") == 0 ) { 703 /* the next position is the start of the new alignment */ 704 sf = SupportingFeature_alloc(); 705 /* do not understand why not having a +1 here is correct. Hmph */ 706 sf->hstart = alc->next->alu[0]->start+1; 707 sf->start = alc->next->alu[1]->start+1 - out->start; 708 } else { 709 sf = NULL; 710 } 711 712 } else { 713 /* could be the start of a run from INSERT into match */ 714 if( sf == NULL ) { 715 sf = SupportingFeature_alloc(); 716 /* we fill in start and end from the exon at the moment. Not pretty */ 717 sf->hstart = alc->alu[0]->start+1; 718 sf->hstrand = 1; /* currently only got proteins. Thank the lord! */ 719 sf->start = alc->alu[1]->start+1 - out->start; 720 } 721 } 722 } /* end of else it is a codon match */ 723 prev = alc; 724 alc = alc->next; 725 726 } 727 728 /* 729 * The exon has ended. But why? 730 */ 731 732 if( sf != NULL ) { 733 sf->hend = prev->alu[0]->end+1; 734 add_Exon(ex,sf); 735 } 736 737 if( alc == NULL ) { 738 out->end = prev->alu[1]->end +1; 739 ex->end = out->end - out->start; 740 if( sf != NULL ) { 741 sf->end = ex->end; 742 } 743 break; 744 } 745 746 if( is_random_AlnColumn_genewise(alc) == TRUE) { 747 out->end = prev->alu[1]->end +1; 748 ex->end = out->end - out->start; 749 if( sf != NULL ) { 750 sf->end = ex->end; 751 } 752 break; 753 } 754 755 756 757/* fprintf(stderr,"Exiting out of exon loop...\n");*/ 758 759 if( strcmp(alc->alu[1]->text_label,"5SS_PHASE_0") == 0 ) { 760 out->end = alc->alu[1]->start+1; 761 phase = 0; 762 } else if ( strcmp(alc->alu[1]->text_label,"5SS_PHASE_1") == 0 ) { 763 out->end = alc->alu[1]->start+2; 764 phase = 1; 765 } else if ( strcmp(alc->alu[1]->text_label,"5SS_PHASE_2") == 0 ) { 766 out->end = alc->alu[1]->start+3; 767 phase = 2; 768 } else { 769 phase = 0; 770 771 /* fixes from Steve, via an issue with Steve */ 772 if ( strncmp(prev->alu[1]->text_label,"3SS_PHASE_0",11) == 0 ) { 773 out->end = prev->alu[1]->end + 4; 774 } else if ( strncmp(prev->alu[1]->text_label,"3SS_PHASE_1",11) == 0 ) { 775 out->end = prev->alu[1]->end + 4; 776 } else if ( strncmp(prev->alu[1]->text_label,"3SS_PHASE_1",11) == 0 ) { 777 out->end = prev->alu[1]->end + 4; 778 } else { 779 out->end = prev->alu[1]->end +1; 780 } 781 782 ex->end = out->end - out->start; 783 if( sf != NULL ) { 784 sf->end = ex->end; 785 } 786 break; 787 788 } 789 790 791 /** set end of exon to the correct size from here **/ 792 ex->end = out->end - out->start; 793 794 795 /** sf is to the codon, not to the exon */ 796 if( sf != NULL ) { 797 if( phase == 0 ) { 798 sf->end = ex->end; 799 } else if ( phase == 1 ) { 800 sf->end = ex->end-1; 801 } else { 802 sf->end = ex->end-2; 803 } 804 } 805 806 807 808 while( alc != NULL && strstartcmp(alc->alu[1]->text_label,"3SS") != 0 ) { 809/* fprintf(stderr,"Intron loop - alc at %s %s %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,sf); */ 810 alc = alc->next; 811 score += alc->alu[0]->score[0]; 812 } 813 814 if( alc == NULL ) { 815 warn("Got to the end of an alignment inside an intron. Oooops!"); 816 break; 817 } 818 819 }/* back to for(;;) */ 820 821 822 if( end != NULL ) 823 *end = alc; 824 825 if( org_start < org_end) { 826 out->start += org_start-1; 827 out->end += org_start-1; 828 } else { 829 /* fprintf(stderr,"was %d to %d\n",out->start,out->end); */ 830 out->start = org_start-1 - out->start; 831 out->end = org_start-1 - out->end; 832 /* fprintf(stderr,"now %d to %d (%d)\n",out->start,out->end,org_start); */ 833 } 834 835 if( out->ispseudo == FALSE ) { 836 ts = Translation_alloc(); 837 ts->start = 0; 838 ts->end = length_Transcript(tr); 839 ts->parent = tr; 840 add_Transcript(tr,ts); 841 } 842 843 tr->parent = out; 844 845 out->bits = Score2Bits(score); 846 return out; 847} 848 849 850%func 851This function is to say what is a frameshift label 852%type internal 853%% 854boolean is_frameshift_AlnColumn_genewise(const AlnColumn * alc) 855{ 856 if( strcmp(alc->alu[1]->text_label,"SEQUENCE_INSERTION") == 0 ) { 857 return TRUE; 858 } 859 if( strcmp(alc->alu[1]->text_label,"SEQUENCE_DELETION") == 0 ) { 860 return TRUE; 861 } 862 return FALSE; 863} 864 865%func 866This function is to say where this should 867be skipped in alignment/gene prediction problems 868%type internal 869%% 870boolean is_random_AlnColumn_genewise(const AlnColumn * alc) 871{ 872 char * la; 873 874 la = alc->alu[1]->text_label; 875 876 if( strcmp(la,"RANDOM_SEQUENCE") == 0 ) 877 return TRUE; 878 if( strcmp(la,"END") == 0 ) 879 return TRUE; 880 881 la = alc->alu[0]->text_label; 882 883 if( strstr(la,"_RND_") != NULL ) 884 return TRUE; 885 886 return FALSE; 887} 888 889 890%func 891This function is to say where things are introns 892%type internal 893%% 894boolean is_intron_AlnColumn_genewise(const AlnColumn * alc) 895{ 896 char * la; 897 898 la = alc->alu[1]->text_label; 899 900 if( strcmp(la,"CENTRAL_INTRON") == 0 ) 901 return TRUE; 902 if( strcmp(la,"PYRIMIDINE_TRACT") == 0 ) 903 return TRUE; 904 if( strcmp(la,"SPACER") == 0 ) 905 return TRUE; 906 907 return FALSE; 908} 909 910 911#define GW_EXON_TYPE_UTR5 45 912#define GW_EXON_TYPE_CDS 46 913#define GW_EXON_TYPE_UTR3 47 914#define GW_EXON_TYPE_NONE 48 915 916int exon_type_AlnColumn_genomewise(AlnColumn * alc) 917{ 918 if( strcmp(alc->alu[1]->text_label,"CODON") == 0 ) { 919 return GW_EXON_TYPE_CDS; 920 } 921 922 if( strcmp(alc->alu[1]->text_label,"UTR5") == 0 ) { 923 return GW_EXON_TYPE_UTR5; 924 } 925 926 if( strcmp(alc->alu[1]->text_label,"UTR3") == 0 || strcmp(alc->alu[1]->text_label,"STOP_CODON") == 0) { 927 return GW_EXON_TYPE_UTR3; 928 } 929 930 return GW_EXON_TYPE_NONE; 931} 932 933 934void show_utr_exon_genomewise(AlnBlock * alb,FILE * ofp) 935{ 936 AlnColumn * alc; 937 int exon_start; 938 int exon_end; 939 int is_start; 940 int phase; 941 int endphase; 942 int is_3ss; 943 944 for(alc=alb->start;alc != NULL;) { 945 /* find the first exon */ 946 for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) != GW_EXON_TYPE_UTR5;alc = alc->next) 947 ; 948 if( alc == NULL ) { 949 break; 950 } 951 fprintf(ofp,"Gene\n"); 952 953 if( alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR5 ) { 954 is_start = 1; 955 while( alc != NULL ) { /* while loop goes over all 5UTRs */ 956 exon_start = alc->alu[1]->start+ (is_start ? 2 : 3); 957 is_start = 0; 958 for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR5;alc = alc->next ) { 959 ; 960 } 961 /* fprintf(stderr,"Broken out with %s\n",alc->alu[1]->text_label); */ 962 963 if( strcmp(alc->alu[1]->text_label,"UTR5_INTRON") ==0 ) { 964 /* ntron. should be +2-1 at the end of this, goes to 1*/ 965 fprintf(ofp," utr5 %d %d\n",exon_start,alc->alu[1]->start+1); 966 /* now loop through the intron */ 967 for(;alc != NULL && strcmp(alc->alu[1]->text_label,"UTR5_INTRON") == 0;alc = alc->next ) { 968 ; 969 } 970 if( alc == NULL || exon_type_AlnColumn_genomewise(alc) != GW_EXON_TYPE_UTR5 ) { 971 break; /* while loop */ 972 } else{ 973 continue; /* another utr5 exon */ 974 } 975 } else { 976 /* print this guy and break */ 977 fprintf(ofp," utr5 %d %d\n",exon_start,alc->alu[1]->start+1); 978 break; 979 } 980 } 981 } 982 983 984 /* we now should be at a CDS column */ 985 986 if( alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_CDS ) { 987 is_start = 1; 988 while( alc != NULL ) { /* while loop goes over all coding Exons */ 989 /* fprintf(stderr,"Entering codoing loop with %s\n",alc->alu[1]->text_label); */ 990 991 exon_start = alc->alu[1]->start+2; 992 if( strstr(alc->alu[1]->text_label,"3SS") != NULL ) { 993 is_3ss = 1; 994 if( strstr(alc->alu[1]->text_label,"1") != NULL ) { 995 phase = 1; 996 } else if ( strstr(alc->alu[1]->text_label,"2") != NULL ) { 997 phase = 2; 998 } else { 999 phase = 0; 1000 } 1001 alc = alc->next; 1002 } else { 1003 is_3ss = 0; 1004 phase = 0; 1005 } 1006 1007 1008 if( phase == 1 ) { 1009 exon_start += 3; 1010 } else if ( phase == 2) { 1011 exon_start += 3; 1012 } else if ( is_3ss ) { 1013 /* phase 0 and spliced needs adjusting */ 1014 exon_start += 3; 1015 } 1016 1017 1018 1019 for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_CDS ;alc = alc->next ) { 1020 ; 1021 } 1022 1023 if( strstr(alc->alu[1]->text_label,"5SS") != NULL ) { 1024 1025 exon_end = alc->alu[1]->start+1; 1026 1027 if( strstr(alc->alu[1]->text_label,"1") != NULL ) { 1028 endphase = 1; 1029 } else if ( strstr(alc->alu[1]->text_label,"2") != NULL ) { 1030 endphase = 2; 1031 } else { 1032 endphase = 0; 1033 } 1034 1035 if( endphase == 1 ) { 1036 exon_end += 1; 1037 } else if( endphase == 2 ) { 1038 exon_end += 2; 1039 } /* no change for phase 0 */ 1040 1041 /* intron. should be +1-1 at the end of this, goes to 0*/ 1042 fprintf(ofp," cds %d %d phase %d\n",exon_start,exon_end,phase); 1043 1044 /* now loop through the intron */ 1045 for(alc= alc->next;alc != NULL && strstr(alc->alu[1]->text_label,"3SS") == NULL;alc = alc->next ) { 1046 ; 1047 } 1048 if( alc == NULL || strstr(alc->alu[1]->text_label,"3SS") == NULL ) { 1049 break; /* while loop */ 1050 } else{ 1051 continue; /* another cds exon */ 1052 } 1053 } else { 1054 fprintf(ofp," cds %d %d phase %d\n",exon_start,alc->alu[1]->start+1,phase); 1055 break; 1056 } 1057 } 1058 } 1059 1060 1061 if( alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR3 ) { 1062 is_start = 1; 1063 while( alc != NULL ) { /* while loop goes over all 3UTRs */ 1064 exon_start = alc->alu[1]->start+ (is_start ? 2 : 3); 1065 is_start = 0; 1066 for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR3 ;alc = alc->next ) { 1067 ; 1068 } 1069 1070 1071 if( strstr(alc->alu[1]->text_label,"INTRON") != NULL ) { 1072 /* intron. should be +2-1 at the end of this, goes to 1*/ 1073 fprintf(ofp," utr3 %d %d\n",exon_start,alc->alu[1]->start+1); 1074 /* now loop through the intron */ 1075 for(;alc != NULL && strstr(alc->alu[1]->text_label,"INTRON") != NULL;alc = alc->next ) { 1076 ; 1077 } 1078 if( alc == NULL || exon_type_AlnColumn_genomewise(alc) != GW_EXON_TYPE_UTR3 ) { 1079 break; /* while loop */ 1080 } else{ 1081 continue; /* another utr3 exon */ 1082 } 1083 } else { 1084 fprintf(ofp," utr3 %d %d\n",exon_start,alc->alu[1]->start+1); 1085 break; 1086 } 1087 } 1088 } 1089 1090 fprintf(ofp,"End\n"); 1091 /* back to next gene */ 1092 1093 } 1094 1095 1096} 1097 1098 1099%} 1100 1101 1102 1103 1104