1
2%{
3#include "dyna.h"
4
5
6%}
7
8
9%{
10#include "geneutil.h"
11
12
13%func
14Makes a new GenomicRegion with the genes
15predicted from this AlnBlock
16
17Really a wrapper around the add_Genes_to_GenomicRegion_GeneWise
18and other functionality
19%arg
20gen genomic sequence to use
21pseudo If true, predicts frameshifted genes as pseudogenes
22alb genewise alignment to predict genes from
23return a newly allocated structure
24%%
25GenomicRegion * new_GenomicRegion_from_GeneWise(Genomic * gen,boolean pseudo,AlnBlock * alb)
26{
27  GenomicRegion * out;
28
29  out = new_GenomicRegion(gen);
30
31  add_Genes_to_GenomicRegion_new(out,alb);
32
33  return out;
34}
35
36%func
37Potential an Alnblock may have more
38than one gene due to looping models
39
40This adds all the genes to gr
41
42%arg
43gr genomic region to add genes to
44org_start    start point of the dna to which the alb was made from
45org_end      end point of the dna to which the alb was made from
46alb          logical label alignment
47pseudo       If true, frameshifted genes are predicted as pseudo genes
48root         the second argument to make_name
49make_name fN a pointer to a function to actually make the name of the gene
50%%
51int add_Genes_to_GenomicRegion_GeneWise(GenomicRegion * gr,int org_start,int org_end,AlnBlock * alb,char * root,boolean pseudo,char * (*make_name)(Wise2_Genomic *,char *,int ,Wise2_Gene * ))
52{
53  int count = 0;
54  Gene * gene;
55  AlnColumn * alc;
56
57  alc = alb->start;
58
59  while( (gene = Gene_from_AlnColumn_GeneWise(alc,org_start,org_end,pseudo,&alc)) != NULL ) {
60    if( make_name != NULL ) {
61      gene->name = (*make_name)(gr->genomic,root,gr->len,gene);
62    }
63    if( root != NULL ) {
64      gene->seqname = stringalloc(root);
65    }
66
67    add_Gene_to_GenomicRegion(gr,gene);
68    count++;
69    if( alc == NULL )
70      break;
71  }
72
73  return count;
74}
75
76
77%func
78Adds genes using the new gene function
79%%
80int add_Genes_to_GenomicRegion_new(GenomicRegion * gr,AlnBlock * alb)
81{
82  int count = 0;
83  Gene * out;
84  Transcript * tr;
85  Transcript * new_tr;
86  Translation * ts;
87  Exon * exon;
88
89  AlnBlock * coll;
90  AlnColumn * alc;
91  AlnColumn * exon_start;
92  AlnColumn * intron_start;
93
94  int trace = 1;
95  int exon_start_coord;
96  int is_reversed = 0;
97  int temp;
98  int i;
99
100  coll = collapsed_AlnBlock(alb,1);
101
102
103
104  /* find start */
105  for(alc = coll->start;alc != NULL;alc = alc->next ) {
106    if( strstr(alc->alu[1]->text_label,"CODON") != NULL ) {
107
108      if( trace )
109	fprintf(stderr,"Got %s as first codon\n",alc->alu[1]->text_label);
110
111      /* got a gene */
112      out = Gene_alloc_len(1);
113      add_Gene_to_GenomicRegion(gr,out);
114
115      tr = Transcript_alloc_std();
116      /* don't add transcript yet, because reversing it requires arsing around with exons */
117
118      if( strstr(alc->alu[1]->text_label,"REV") != NULL ) {
119	is_reversed = 1;
120      } else {
121	is_reversed = 0;
122      }
123      exon_start_coord = alc->alu[1]->start+1;
124      out->start = exon_start_coord;
125      /* first exon is different as there is no 3' splice site */
126
127      exon_start = alc;
128      alc = alc->next;
129      if( alc == NULL ) {
130	warn("weird death in gene parsing");
131	return 1;
132      }
133
134      /* handle single exon genes */
135      if( strstr(alc->alu[1]->text_label,"SS") == NULL ) {
136	/* single exon gene */
137
138	exon = Exon_alloc_std();
139	exon->start = 0;
140	exon->end   = alc->alu[1]->start+3 - exon_start->alu[1]->start;
141	if( trace )
142	  fprintf(stderr,"Single exon gene %s from %d to %d\n",alc->alu[1]->text_label,exon->start,exon->end);
143
144	if( is_reversed == 0 ) {
145	  out->start   = exon_start->alu[1]->start+1;
146	  out->end     = alc->alu[1]->start+4;
147	} else {
148	  out->end     = exon_start->alu[1]->start;
149	  out->start   = alc->alu[1]->start+3;
150	}
151
152	add_ex_Transcript(tr,exon);
153	ts = Translation_alloc();
154	ts->start = 0;
155	ts->end = length_Transcript(tr);
156	ts->parent = tr;
157	add_Transcript(tr,ts);
158	tr->parent = out;
159	add_Gene(out,tr);
160
161	continue;
162      }
163
164      /* handle first exon */
165      exon = Exon_alloc_std();
166      add_ex_Transcript(tr,exon);
167      exon->start = exon_start->alu[1]->start+1 - exon_start_coord;
168
169      if( strstr(alc->alu[1]->text_label,"PHASE_0") != NULL) {
170	exon->end   = alc->alu[1]->start +1 - out->start;
171      } else if (  strstr(alc->alu[1]->text_label,"PHASE_1") != NULL)  {
172	exon->end   = alc->alu[1]->start +2 - out->start;
173      } else if (  strstr(alc->alu[1]->text_label,"PHASE_2") != NULL) {
174	exon->end   = alc->alu[1]->start +3 - out->start;
175      }
176
177
178      /* loop over internal exons. Alc is current on leftmost splice site */
179      for( ;; ) {
180	if( trace )
181	  fprintf(stderr,"internal exon - this should be leftmost splice %s\n",alc->alu[1]->text_label);
182
183	if( strstr(alc->next->alu[1]->text_label,"SS") != NULL ) {
184	  warn("Have a no-base intron. Conceptually possible, but highly unlikely. Probably misparamterisation somewhere...");
185	} else {
186	  alc = alc->next;
187	  if( trace )
188	    fprintf(stderr,"internal exon - this should be INTRON %s\n",alc->alu[1]->text_label);
189	}
190
191
192	if( alc == NULL ) {
193	  warn("Weird death inside Intron of gene parsing");
194	  return 1;
195	}
196	alc = alc->next;
197
198	if( trace )
199	  fprintf(stderr,"internal exon - this should be rightmost splice %s\n",alc->alu[1]->text_label);
200
201
202	if( alc == NULL ) {
203	  warn("Weird death inside Intron of gene parsing");
204	  return 1;
205	}
206
207	if( strstr(alc->alu[1]->text_label,"SS") == NULL ) {
208	  warn("At end of intron , non splice label %s",alc->alu[1]->text_label);
209	  return 1;
210	}
211
212	exon_start = alc;
213
214	alc = alc->next;
215
216	if( trace )
217	  fprintf(stderr,"internal exon - this should be CODON %s\n",alc->alu[1]->text_label);
218
219	if( alc == NULL ) {
220	  warn("Weird death inside Intron of gene parsing");
221	  return 1;
222	}
223	alc = alc->next;
224	if( alc == NULL ) {
225	  warn("Weird death inside Intron of gene parsing");
226	  return 1;
227	}
228
229	if( trace )
230	  fprintf(stderr,"internal exon - this should be 5SS %s\n",alc->alu[1]->text_label);
231
232	/* should have a 3'SS whatever */
233	exon = Exon_alloc_std();
234	add_ex_Transcript(tr,exon);
235
236	if( strstr(exon_start->alu[1]->text_label,"PHASE_0") != NULL) {
237	  exon->start   = exon_start->alu[1]->start +4 - out->start;
238	} else if (  strstr(exon_start->alu[1]->text_label,"PHASE_1") == NULL)  {
239	  exon->start   = exon_start->alu[1]->start +4 - out->start;
240	} else if (  strstr(exon_start->alu[1]->text_label,"PHASE_2") == NULL) {
241	  exon->start   = exon_start->alu[1]->start +4 - out->start;
242	}
243
244
245	if( strstr(alc->alu[1]->text_label,"SS") == NULL ) {
246	  fprintf(stderr,"Breaking with no 5'SS on %s with %d\n",alc->alu[1]->text_label,alc->alu[1]->start);
247	  exon->end = alc->alu[1]->start +1   - out->start;
248	  break;
249	}  else {
250	  if( strstr(alc->alu[1]->text_label,"PHASE_0") != NULL) {
251	    exon->end   = alc->alu[1]->start +1 - out->start;
252	  } else if (  strstr(alc->alu[1]->text_label,"PHASE_1") != NULL)  {
253	    exon->end   = alc->alu[1]->start +2 - out->start;
254	  } else if (  strstr(alc->alu[1]->text_label,"PHASE_2") != NULL) {
255	    exon->end   = alc->alu[1]->start +3 - out->start;
256	  }
257
258	}
259	/* alc is left on the 5'SS for the next exon */
260      } /* end of for (;;) over internal exons*/
261
262      /* end of a gene. exon is the last exon */
263      out->end = exon->end + out->start;
264
265      ts = Translation_alloc();
266      ts->start = 0;
267      ts->end = length_Transcript(tr);
268      if( ts->end % 3 != 0 ) {
269	warn("Transcript is not mod 3 size. It is %d doh!",ts->end);
270      }
271
272      if( is_reversed == 1 ) {
273	if( trace )
274	  fprintf(stderr,"Reversing gene %d to %d\n",out->start,out->end);
275	temp = out->end;
276	temp = temp;
277	out->end = out->start;
278	out->start = temp;
279
280	/* reversed genes have an off by one convention */
281	out->end -= 1;
282	out->start -= 1;
283
284	/* now have to reverse transcript. doh! */
285	new_tr = Transcript_alloc_std();
286	temp = out->start - out->end;
287	for(i=tr->ex_len-1;i >= 0;i-- ) {
288	  exon = Exon_alloc_std();
289	  exon->start = temp - tr->exon[i]->end;
290	  exon->end   = temp - tr->exon[i]->start;
291	  fprintf(stderr,"Adding exon %d %d which used to be %d %d\n",exon->start,exon->end,tr->exon[i]->start,tr->exon[i]->end);
292	  add_ex_Transcript(new_tr,exon);
293	}
294	free_Transcript(tr);
295	tr = new_tr;
296      }
297
298      add_Gene(out,tr);
299
300      ts->parent = tr;
301      add_Transcript(tr,ts);
302      tr->parent = out;
303
304    } else {
305      if( trace )
306	fprintf(stderr,"Skipping %s before codon\n",alc->alu[1]->text_label);
307      /* do nothing if not a codon - march on */
308    }
309
310  }
311
312  /*  free_AlnBlock(coll);*/
313
314  return 1;
315}
316
317
318%func
319helper function for new system
320%%
321boolean is_exon_AlnColumn_new(AlnColumn * alc)
322{
323  if( strstr(alc->alu[1]->text_label,"CODON") != NULL ) {
324    return TRUE;
325  } else {
326    return FALSE;
327  }
328
329}
330
331
332%func
333helper function for the new system
334%%
335boolean is_splice_site_AlnColumn(AlnColumn * alc,int * type,int * phase)
336{
337
338  if( strstr(alc->alu[1]->text_label,"SS") != NULL ) {
339
340    if( strstr(alc->alu[1]->text_label,"5SS") != NULL ) {
341      *type = 5;
342    } else {
343      *type = 3;
344    }
345
346    if( strstr(alc->alu[1]->text_label,"0") != NULL ) {
347      *phase = 0;
348    } else if ( strstr(alc->alu[1]->text_label,"1") != NULL ) {
349      *phase = 1;
350    } else if ( strstr(alc->alu[1]->text_label,"2") != NULL ) {
351      *phase = 2;
352    }
353    return TRUE;
354  } else {
355    return FALSE;
356  }
357}
358
359%func
360A new hope for building genes
361%%
362Gene * Gene_from_AlnColumn_new(AlnColumn * alc,AlnColumn ** end)
363{
364  Gene * out;
365  Transcript * tr;
366  Translation * ts;
367  Exon * ex;
368
369  int exon_start;
370  int exon_end;
371  int phase_start;
372  int phase_end;
373
374  int is_reversed;
375  int is_3ss;
376
377  int phase;
378  int type;
379  int tmp;
380
381  AlnColumn * prev;
382
383  boolean end_gene = 0;
384
385  while(alc != NULL && is_exon_AlnColumn_new(alc) == FALSE )
386    alc = alc->next;
387
388  if( alc == NULL ) {
389    *end = NULL;
390    return NULL;
391  }
392
393  if( strstr(alc->alu[1]->text_label,"CODON") == NULL ) {
394    warn("Bad news... exited from random columns, but not in a codon column, in a %s",alc->alu[1]->text_label);
395    * end = NULL;
396    return NULL;
397  }
398
399  /* we should have either a forward or reversed gene */
400
401  if( strstr(alc->alu[1]->text_label,"REV") != NULL ) {
402    is_reversed = 1;
403  } else {
404    is_reversed = 0;
405  }
406
407  out = Gene_alloc_len(1);
408  tr = Transcript_alloc_std();
409  add_Gene(out,tr);
410
411  out->start = alc->alu[1]->start +1;
412
413  phase_start = 0;
414  exon_start  = 0; /* by definition, the first exon is at the start*/
415  is_3ss = 0;
416
417  for(;;) {
418
419    if( strstr(alc->alu[1]->text_label,"GENE_EXIT") != NULL ) {
420      break;
421    }
422
423    exon_start = alc->alu[1]->start+1 - out->start;
424
425    if( is_3ss ) {
426      if( phase_start == 0 ) {
427	exon_start -= 3;
428      } else if ( phase_start == 1 ) {
429	exon_start -= 2;
430      } else if ( phase_start == 2 ) {
431	exon_start -= 1;
432      }
433    }
434
435    /* this is at the start of exon */
436    /* go to the end of the exon */
437    for(; alc != NULL && is_exon_AlnColumn_new(alc);alc=alc->next) {
438      fprintf(stdout,"Exonifying past %d %s\n",alc->alu[1]->start,alc->alu[1]->text_label);
439      prev = alc;
440    }
441
442    /* irregardless of the fate of this exon, we now can put it
443     * in, as if it was phase0-phase0 */
444
445
446    exon_end = prev->alu[1]->end +1 - out->start;
447
448
449    ex = Exon_alloc_std();
450    fprintf(stdout,"Adding exon with %d-%d\n",ex->start,ex->end);
451    add_ex_Transcript(tr,ex);
452
453    ex->start = exon_start;
454    ex->end   = exon_end;
455
456    if( alc != NULL && is_splice_site_AlnColumn(alc,&type,&phase) ) {
457      phase_end = phase;
458    } else {
459      is_3ss = 0;
460      phase_end = 0;
461      end_gene = 1;
462    }
463
464    /* jigging the splice sites for the phases */
465
466    if( phase_end == 0 ) {
467      ex->end = ex->end;
468    } else if( phase_end == 1 ) {
469      ex->end = ex->end +1;
470    } else if( phase_end == 2 ) {
471      ex->end = ex->end +2;
472    }
473
474    ex->phase = phase_start;
475
476    if( end_gene == 1 ) {
477      break;
478    }
479
480    for(; alc != NULL && !is_exon_AlnColumn_new(alc);alc=alc->next) {
481/*      fprintf(stdout,"Moving past %d %s\n",alc->alu[1]->start,alc->alu[1]->text_label);*/
482      if( is_splice_site_AlnColumn(alc,&type,&phase) ) {
483	is_3ss = 1;
484	phase_start = phase;
485      }
486      if( strstr(alc->alu[1]->text_label,"GENE_EXIT") != NULL ) {
487/*	fprintf(stdout,"Exiting %d %s\n",alc->alu[1]->start,alc->alu[1]->text_label); */
488	end_gene = 1;
489	break;
490      }
491
492    }
493
494    if( end_gene == 1 ) {
495      break;
496    }
497
498    if( alc == NULL ) {
499      break;
500    }
501  }
502
503  *end = alc;
504  out->end = ex->end + out->start;
505
506  if( is_reversed == 1 ) {
507    tmp = out->start;
508    out->start = out->end;
509    out->end = tmp;
510  }
511
512
513  ts = Translation_alloc();
514  ts->start = 0;
515  ts->end = length_Transcript(tr);
516  ts->parent = tr;
517  add_Transcript(tr,ts);
518
519  tr->parent = out;
520
521
522  fprintf(stdout,"returning gene %d %d\n",out->start,out->end);
523
524  return out;
525}
526
527%func
528A wrap for making a gene structure from
529an AlnBlock derived from one of the genewise
530methods
531%arg
532alc Alignment column in an AlnBlock produced by genewise
533org_start offset that the genewise alignment was to the coordinate system
534org_end emd that the genewise alignment was to the coordinate system
535end w pointer to a AlnColumn * to say when it has finished with this gene
536%%
537Gene * Gene_from_AlnColumn_GeneWise(AlnColumn * alc,int org_start,int org_end,boolean predict_pseudo_for_frameshift,AlnColumn ** end)
538{
539  Gene * out;
540  Transcript * tr;
541  Translation * ts;
542  Exon * ex;
543  AlnColumn * prev;
544  SupportingFeature * sf;
545  int score = 0;
546  int dosf = 0;
547  int phase = 0;
548  int frame_break = 0;
549
550
551  while(alc != NULL && is_random_AlnColumn_genewise(alc) == TRUE )
552    alc = alc->next;
553
554  while (alc != NULL && strcmp(alc->alu[1]->text_label,"CODON") != 0 )
555    alc = alc->next;
556
557  if( alc == NULL)
558    return NULL;
559
560  out = Gene_alloc_len(1);
561  tr = Transcript_alloc_std();
562  add_Gene(out,tr);
563
564  out->start = alc->alu[1]->start +1;
565
566  score += alc->alu[0]->score[0];
567
568  for(;;) {
569
570/*        fprintf(stderr,"Top loop - alc at %s %s\n",alc->alu[1]->text_label,alc->alu[0]->text_label);  */
571
572    /*    fprintf(stderr,"2 Score is %.2f %s %d %.2f\n",Score2Bits(score),alc->alu[1]->text_label,alc->alu[0]->score[0],Score2Bits(alc->alu[0]->score[0]));*/
573    ex = Exon_alloc_std();
574    add_ex_Transcript(tr,ex);
575
576    /*
577     * this is always the start of an exon
578     */
579
580    dosf = 0;
581
582    if( strcmp(alc->alu[1]->text_label,"CODON") == 0 ) {
583      ex->start = alc->alu[1]->start+1 - out->start; /* coordinated in alignment coords */
584      dosf = 1;
585      phase = 0;
586    } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_0") == 0  ) {
587      ex->start = alc->alu[1]->start +4 - out->start;
588      dosf = 1;
589      phase = 0;
590    } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_1") == 0  ) {
591      ex->start = alc->alu[1]->start +4 - out->start;
592      phase = 1;
593    } else if ( strcmp(alc->alu[1]->text_label,"3SS_PHASE_2") == 0  ) {
594      ex->start = alc->alu[1]->start +4 - out->start;
595      phase = 2;
596    } else {
597      ex->start = alc->alu[1]->start +1 - out->start; /* coordinated in alignment coords */
598      phase = -1;
599    }
600
601    ex->phase = phase;
602
603    /*
604     * Exons can start in INSERTs (yuk). In which case we don't
605     * make a supporting feature
606     */
607
608
609    if( dosf == 1 && strstartcmp(alc->alu[0]->text_label,"INSERT") != 0 ) {
610      sf = SupportingFeature_alloc();
611      /* we fill in start and end from the exon at the moment. Not pretty */
612      sf->hstart  = alc->alu[0]->start+1;
613      sf->hstrand = 1; /* currently only got proteins. Thank the lord! */
614      sf->start = ex->start;
615    } else {
616      sf = NULL; /* make sure we don't generate a sf here */
617    }
618
619
620
621    for(prev=alc,alc=alc->next;alc != NULL; ) {
622     /* fprintf(stderr,"Exon loop - alc at %s %s %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,sf); 	*/
623
624      score += alc->alu[0]->score[0];
625      /*      fprintf(stderr,"1 Score is %.2f %s\n",Score2Bits(score),alc->alu[1]->text_label); */
626      if( is_frameshift_AlnColumn_genewise(alc) == TRUE && predict_pseudo_for_frameshift == TRUE ) {
627	score += alc->alu[0]->score[0];
628	fprintf(stderr,"Score is %.2f\n",Score2Bits(score));
629	out->ispseudo = TRUE;
630	alc = alc->next;
631	continue;
632      }
633
634      if( is_random_AlnColumn_genewise(alc) == TRUE)
635	break;
636
637      if( strcmp(alc->alu[1]->text_label,"CODON") != 0 ) {
638	if( strstartcmp(alc->alu[0]->text_label,"DELETE") == 0 ) {
639	  /* must add sf and start a new one */
640	  /*fprintf(stderr,"Looking at alc at %s %s %d %d %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,prev,out,sf);*/
641
642	  if( sf != NULL ) {
643	    sf->end  = prev->alu[1]->end+1 - out->start;
644	    sf->hend = prev->alu[0]->end+1;
645	    add_Exon(ex,sf);
646	    sf = NULL;
647	  }
648
649	  /*
650	   * go the end of this delete run, which are residues in the query with no
651	   * target info
652	   */
653	  while( alc->next != NULL && strstartcmp(alc->next->alu[0]->text_label,"DELETE") == 0 ) {
654	    alc = alc->next;
655	  }
656	  if( alc->next != NULL && strcmp(alc->next->alu[1]->text_label,"CODON") == 0 ) {
657	    /* the next position is the start of the new alignment */
658	    sf = SupportingFeature_alloc();
659	    sf->hstart = alc->next->alu[0]->start+1;
660	    sf->start  = alc->next->alu[1]->start+1 - out->start;
661	  } else {
662	    sf = NULL;
663	  }
664	} else {
665	  break;
666	}
667
668      } else {
669	/* it is a codon match, but it could be an insert */
670	if( strstartcmp(alc->alu[0]->text_label,"INSERT") == 0) {
671	  /* break at this point, add this supporting feature */
672	  if( sf != NULL ) {
673	    sf->end  = prev->alu[1]->end+1 - out->start;
674	    sf->hend = prev->alu[0]->end+1;
675	    add_Exon(ex,sf);
676	    sf = NULL;
677	  }
678
679	  frame_break = 0;
680
681	  /* go to the end of this insert run, watching for frameshifts */
682	  while( alc->next != NULL && strstartcmp(alc->next->alu[0]->text_label,"INSERT") == 0 ) {
683
684	    if( is_frameshift_AlnColumn_genewise(alc->next) == TRUE || is_random_AlnColumn_genewise(alc->next) == TRUE) {
685
686	      if( is_frameshift_AlnColumn_genewise(alc->next) == TRUE && predict_pseudo_for_frameshift == TRUE ) {
687		out->ispseudo = TRUE;
688		alc = alc->next;
689		continue;
690	      } else {
691		alc = alc->next;
692		frame_break = 1;
693		break;
694	      }
695	    }
696	    alc = alc->next;
697	  }
698	  if( frame_break == 1 ) {
699	    break; /* out of this gene */
700	  }
701
702	  if( alc->next != NULL && strcmp(alc->next->alu[1]->text_label,"CODON") == 0 ) {
703	    /* the next position is the start of the new alignment */
704	    sf = SupportingFeature_alloc();
705	    /* do not understand why not having a +1 here is correct. Hmph */
706	    sf->hstart = alc->next->alu[0]->start+1;
707	    sf->start  = alc->next->alu[1]->start+1 - out->start;
708	  } else {
709	    sf = NULL;
710	  }
711
712	} else {
713	  /* could be the start of a run from INSERT into match */
714	  if( sf == NULL ) {
715	    sf = SupportingFeature_alloc();
716	    /* we fill in start and end from the exon at the moment. Not pretty */
717	    sf->hstart  = alc->alu[0]->start+1;
718	    sf->hstrand = 1; /* currently only got proteins. Thank the lord! */
719	    sf->start = alc->alu[1]->start+1 - out->start;
720	  }
721	}
722      } /* end of else it is a codon match */
723      prev  = alc;
724      alc = alc->next;
725
726    }
727
728    /*
729     * The exon has ended. But why?
730     */
731
732    if( sf != NULL ) {
733      sf->hend = prev->alu[0]->end+1;
734      add_Exon(ex,sf);
735    }
736
737    if( alc == NULL ) {
738      out->end = prev->alu[1]->end +1;
739      ex->end = out->end - out->start;
740      if( sf != NULL ) {
741	sf->end = ex->end;
742      }
743      break;
744    }
745
746    if( is_random_AlnColumn_genewise(alc) == TRUE) {
747      out->end = prev->alu[1]->end +1;
748      ex->end = out->end - out->start;
749      if( sf != NULL ) {
750	sf->end = ex->end;
751      }
752      break;
753    }
754
755
756
757/*    fprintf(stderr,"Exiting out of exon loop...\n");*/
758
759    if( strcmp(alc->alu[1]->text_label,"5SS_PHASE_0") == 0 ) {
760      out->end = alc->alu[1]->start+1;
761      phase = 0;
762    } else if ( strcmp(alc->alu[1]->text_label,"5SS_PHASE_1") == 0 ) {
763      out->end = alc->alu[1]->start+2;
764      phase = 1;
765    } else if ( strcmp(alc->alu[1]->text_label,"5SS_PHASE_2") == 0 ) {
766      out->end = alc->alu[1]->start+3;
767      phase = 2;
768    } else {
769      phase = 0;
770
771      /* fixes from Steve, via an issue with Steve */
772      if ( strncmp(prev->alu[1]->text_label,"3SS_PHASE_0",11) == 0 ) {
773        out->end = prev->alu[1]->end + 4;
774      } else if ( strncmp(prev->alu[1]->text_label,"3SS_PHASE_1",11) == 0 ) {
775        out->end = prev->alu[1]->end + 4;
776      } else if ( strncmp(prev->alu[1]->text_label,"3SS_PHASE_1",11) == 0 ) {
777        out->end = prev->alu[1]->end + 4;
778      } else {
779        out->end = prev->alu[1]->end +1;
780      }
781
782      ex->end = out->end - out->start;
783      if( sf != NULL ) {
784	sf->end = ex->end;
785      }
786      break;
787
788    }
789
790
791    /** set end of exon to the correct size from here **/
792    ex->end = out->end - out->start;
793
794
795    /** sf is to the codon, not to the exon */
796    if( sf != NULL ) {
797      if( phase == 0 ) {
798	sf->end = ex->end;
799      } else if ( phase == 1 ) {
800	sf->end = ex->end-1;
801      } else {
802	sf->end = ex->end-2;
803      }
804    }
805
806
807
808    while( alc != NULL && strstartcmp(alc->alu[1]->text_label,"3SS") != 0 ) {
809/*      fprintf(stderr,"Intron loop - alc at %s %s %d\n",alc->alu[1]->text_label,alc->alu[0]->text_label,sf); 	*/
810      alc = alc->next;
811      score += alc->alu[0]->score[0];
812    }
813
814    if( alc == NULL ) {
815      warn("Got to the end of an alignment inside an intron. Oooops!");
816      break;
817    }
818
819  }/* back to for(;;) */
820
821
822  if( end != NULL )
823    *end = alc;
824
825  if( org_start < org_end) {
826    out->start += org_start-1;
827    out->end   += org_start-1;
828  } else {
829    /*    fprintf(stderr,"was %d to %d\n",out->start,out->end); */
830    out->start = org_start-1 - out->start;
831    out->end   = org_start-1 - out->end;
832    /*    fprintf(stderr,"now %d to %d (%d)\n",out->start,out->end,org_start); */
833  }
834
835  if( out->ispseudo == FALSE ) {
836    ts = Translation_alloc();
837    ts->start = 0;
838    ts->end = length_Transcript(tr);
839    ts->parent = tr;
840    add_Transcript(tr,ts);
841  }
842
843  tr->parent = out;
844
845  out->bits = Score2Bits(score);
846  return out;
847}
848
849
850%func
851This function is to say what is a frameshift label
852%type internal
853%%
854boolean is_frameshift_AlnColumn_genewise(const AlnColumn * alc)
855{
856  if( strcmp(alc->alu[1]->text_label,"SEQUENCE_INSERTION") == 0 ) {
857    return TRUE;
858  }
859  if( strcmp(alc->alu[1]->text_label,"SEQUENCE_DELETION") == 0 ) {
860    return TRUE;
861  }
862  return FALSE;
863}
864
865%func
866This function is to say where this should
867be skipped in alignment/gene prediction problems
868%type internal
869%%
870boolean is_random_AlnColumn_genewise(const AlnColumn * alc)
871{
872  char * la;
873
874  la  = alc->alu[1]->text_label;
875
876  if( strcmp(la,"RANDOM_SEQUENCE") == 0 )
877    return TRUE;
878  if( strcmp(la,"END") == 0 )
879    return TRUE;
880
881  la  = alc->alu[0]->text_label;
882
883  if( strstr(la,"_RND_") != NULL )
884    return TRUE;
885
886  return FALSE;
887}
888
889
890%func
891This function is to say where things are introns
892%type internal
893%%
894boolean is_intron_AlnColumn_genewise(const AlnColumn * alc)
895{
896  char * la;
897
898  la  = alc->alu[1]->text_label;
899
900  if( strcmp(la,"CENTRAL_INTRON") == 0 )
901    return TRUE;
902  if( strcmp(la,"PYRIMIDINE_TRACT") == 0 )
903    return TRUE;
904  if( strcmp(la,"SPACER") == 0 )
905    return TRUE;
906
907  return FALSE;
908}
909
910
911#define GW_EXON_TYPE_UTR5 45
912#define GW_EXON_TYPE_CDS  46
913#define GW_EXON_TYPE_UTR3 47
914#define GW_EXON_TYPE_NONE 48
915
916int  exon_type_AlnColumn_genomewise(AlnColumn * alc)
917{
918  if( strcmp(alc->alu[1]->text_label,"CODON") == 0 ) {
919    return GW_EXON_TYPE_CDS;
920  }
921
922  if( strcmp(alc->alu[1]->text_label,"UTR5") == 0 ) {
923    return GW_EXON_TYPE_UTR5;
924  }
925
926  if( strcmp(alc->alu[1]->text_label,"UTR3") == 0 || strcmp(alc->alu[1]->text_label,"STOP_CODON") == 0) {
927    return GW_EXON_TYPE_UTR3;
928  }
929
930  return GW_EXON_TYPE_NONE;
931}
932
933
934void show_utr_exon_genomewise(AlnBlock * alb,FILE * ofp)
935{
936  AlnColumn * alc;
937  int exon_start;
938  int exon_end;
939  int is_start;
940  int phase;
941  int endphase;
942  int is_3ss;
943
944  for(alc=alb->start;alc != NULL;) {
945    /* find the first exon */
946    for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) != GW_EXON_TYPE_UTR5;alc = alc->next)
947      ;
948    if( alc == NULL ) {
949      break;
950    }
951    fprintf(ofp,"Gene\n");
952
953    if( alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR5 ) {
954      is_start = 1;
955      while( alc != NULL ) { /* while loop goes over all 5UTRs */
956	exon_start = alc->alu[1]->start+ (is_start ? 2 : 3);
957        is_start = 0;
958	for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR5;alc = alc->next ) {
959	  ;
960	}
961        /*	fprintf(stderr,"Broken out with %s\n",alc->alu[1]->text_label); */
962
963	if( strcmp(alc->alu[1]->text_label,"UTR5_INTRON") ==0 ) {
964	  /* ntron. should be +2-1 at the end of this, goes to 1*/
965	  fprintf(ofp,"  utr5 %d %d\n",exon_start,alc->alu[1]->start+1);
966	  /* now loop through the intron */
967	  for(;alc != NULL && strcmp(alc->alu[1]->text_label,"UTR5_INTRON") == 0;alc = alc->next ) {
968	    ;
969	  }
970	  if( alc == NULL || exon_type_AlnColumn_genomewise(alc) != GW_EXON_TYPE_UTR5 ) {
971	    break; /* while loop */
972	  } else{
973	    continue; /* another utr5 exon */
974	  }
975	} else {
976	  /* print this guy and break */
977	  fprintf(ofp,"  utr5 %d %d\n",exon_start,alc->alu[1]->start+1);
978	  break;
979	}
980      }
981    }
982
983
984    /* we now should be at a CDS column */
985
986    if( alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_CDS ) {
987      is_start = 1;
988      while( alc != NULL ) { /* while loop goes over all coding Exons */
989	/* fprintf(stderr,"Entering codoing loop with %s\n",alc->alu[1]->text_label); */
990
991	exon_start = alc->alu[1]->start+2;
992	if( strstr(alc->alu[1]->text_label,"3SS") != NULL ) {
993	  is_3ss = 1;
994	  if( strstr(alc->alu[1]->text_label,"1") != NULL ) {
995	    phase = 1;
996	  } else if ( strstr(alc->alu[1]->text_label,"2") != NULL ) {
997	    phase = 2;
998	  } else {
999	    phase = 0;
1000	  }
1001	  alc = alc->next;
1002	} else {
1003	  is_3ss = 0;
1004	  phase = 0;
1005	}
1006
1007
1008	if( phase == 1 ) {
1009	  exon_start += 3;
1010	} else if ( phase == 2) {
1011	  exon_start += 3;
1012	} else if ( is_3ss ) {
1013	  /* phase 0 and spliced needs adjusting */
1014	  exon_start += 3;
1015	}
1016
1017
1018
1019	for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_CDS ;alc = alc->next ) {
1020	  ;
1021	}
1022
1023	if( strstr(alc->alu[1]->text_label,"5SS") != NULL ) {
1024
1025	  exon_end = alc->alu[1]->start+1;
1026
1027	  if( strstr(alc->alu[1]->text_label,"1") != NULL ) {
1028	    endphase = 1;
1029	  } else if ( strstr(alc->alu[1]->text_label,"2") != NULL ) {
1030	    endphase = 2;
1031	  } else {
1032	    endphase = 0;
1033	  }
1034
1035	  if( endphase == 1 ) {
1036	    exon_end += 1;
1037	  } else if( endphase == 2 ) {
1038	    exon_end += 2;
1039	  } /* no change for phase 0 */
1040
1041	  /* intron. should be +1-1 at the end of this, goes to 0*/
1042	  fprintf(ofp,"  cds %d %d phase %d\n",exon_start,exon_end,phase);
1043
1044	  /* now loop through the intron */
1045	  for(alc= alc->next;alc != NULL && strstr(alc->alu[1]->text_label,"3SS") == NULL;alc = alc->next ) {
1046	    ;
1047	  }
1048	  if( alc == NULL || strstr(alc->alu[1]->text_label,"3SS") == NULL ) {
1049	    break; /* while loop */
1050	  } else{
1051	    continue; /* another cds exon */
1052	  }
1053	} else {
1054	  fprintf(ofp,"  cds %d %d phase %d\n",exon_start,alc->alu[1]->start+1,phase);
1055	  break;
1056	}
1057      }
1058    }
1059
1060
1061    if( alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR3 ) {
1062      is_start = 1;
1063      while( alc != NULL ) { /* while loop goes over all 3UTRs */
1064	exon_start = alc->alu[1]->start+ (is_start ? 2 : 3);
1065        is_start = 0;
1066	for(;alc != NULL && exon_type_AlnColumn_genomewise(alc) == GW_EXON_TYPE_UTR3 ;alc = alc->next ) {
1067	  ;
1068	}
1069
1070
1071	if( strstr(alc->alu[1]->text_label,"INTRON") != NULL ) {
1072	  /* intron. should be +2-1 at the end of this, goes to 1*/
1073	  fprintf(ofp,"  utr3 %d %d\n",exon_start,alc->alu[1]->start+1);
1074	  /* now loop through the intron */
1075	  for(;alc != NULL && strstr(alc->alu[1]->text_label,"INTRON") != NULL;alc = alc->next ) {
1076	    ;
1077	  }
1078	  if( alc == NULL || exon_type_AlnColumn_genomewise(alc) != GW_EXON_TYPE_UTR3 ) {
1079	    break; /* while loop */
1080	  } else{
1081	    continue; /* another utr3 exon */
1082	  }
1083	} else {
1084	  fprintf(ofp,"  utr3 %d %d\n",exon_start,alc->alu[1]->start+1);
1085	  break;
1086	}
1087      }
1088    }
1089
1090    fprintf(ofp,"End\n");
1091    /* back to next gene */
1092
1093  }
1094
1095
1096}
1097
1098
1099%}
1100
1101
1102
1103
1104