1# BioPerl module for Bio::Tools::Run::Alignment::Sim4
2#
3# Please direct questions and support issues to <bioperl-l@bioperl.org>
4#
5# Cared for by
6#
7# Copyright Shawn Hoon
8#
9# You may distribute this module under the same terms as perl itself
10# POD documentation - main docs before the code
11
12=head1 NAME
13
14Bio::Tools::Run::Alignment::Sim4 - Wrapper for Sim4 program that allows
15for alignment of cdna to genomic sequences
16
17=head1 SYNOPSIS
18
19  use Bio::Tools::Run::Alignment::Sim4;
20
21  my @params = (W=>15,K=>17,D=>10,N=>10,cdna_seq=>"mouse_cdna.fa",genomic_seq=>"mouse_genomic.fa");
22  my $sim4 = Bio::Tools::Run::Alignment::Sim4->new(@params);
23
24  my @exon_sets = $sim4->align;
25
26  foreach my $set(@exon_sets){
27    foreach my $exon($set->sub_SeqFeature){
28        print $exon->start."\t".$exon->end."\t".$exon->strand."\n";
29        print "\tMatched ".$exon->est_hit->seq_id."\t".$exon->est_hit->start."\t".$exon->est_hit->end."\n";
30    }
31  }
32
33  One can also provide a est database
34
35 $sio = Bio::SeqIO->new(-file=>"est.fa",-format=>"fasta");
36 @est_seq=();
37 while(my $seq = $sio->next_seq){
38         push @est_seq,$seq;
39 }
40
41 my @exon_sets = $factory->align(\@est_seq,$genomic);
42
43=head1 DESCRIPTION
44
45Sim4 program is developed by Florea et al. for aligning cdna/est
46sequence to genomic sequences
47
48Florea L, Hartzell G, Zhang Z, Rubin GM, Miller W.
49A computer program for aligning a cDNA sequence with a genomic DNA sequence.
50Genome Res 1998 Sep;8(9):967-74
51
52The program is available for download here:
53http://globin.cse.psu.edu/
54
55=head1 FEEDBACK
56
57=head2 Mailing Lists
58
59User feedback is an integral part of the evolution of this and other
60Bioperl modules. Send your comments and suggestions preferably to one
61of the Bioperl mailing lists.  Your participation is much appreciated.
62
63  bioperl-l@bioperl.org                  - General discussion
64  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
65
66=head2 Support
67
68Please direct usage questions or support issues to the mailing list:
69
70I<bioperl-l@bioperl.org>
71
72rather than to the module maintainer directly. Many experienced and
73reponsive experts will be able look at the problem and quickly
74address it. Please include a thorough description of the problem
75with code and data examples if at all possible.
76
77=head2 Reporting Bugs
78
79Report bugs to the Bioperl bug tracking system to help us keep track
80the bugs and their resolution.  Bug reports can be submitted via the
81web:
82
83  http://redmine.open-bio.org/projects/bioperl/
84
85=head1 AUTHOR - Shawn Hoon
86
87Email shawnh@fugu-sg.org
88
89=head1 APPENDIX
90
91The rest of the documentation details each of the object
92methods. Internal methods are usually preceded with a _
93
94=cut
95
96
97package Bio::Tools::Run::Alignment::Sim4;
98use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME
99            @SIM4_PARAMS @OTHER_PARAMS @OTHER_SWITCHES %OK_FIELD);
100use strict;
101use Bio::SeqIO;
102use Bio::SimpleAlign;
103use Bio::AlignIO;
104use Bio::Root::Root;
105use Bio::Root::IO;
106use Bio::Tools::Run::WrapperBase;
107use Bio::Tools::Sim4::Results;
108
109@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase);
110
111# You will need to enable Sim4 to find the Sim4 program. This
112# can be done in (at least) two ways:
113#
114# 1. define an environmental variable SIM4DIR
115# export SIM4DIR =/usr/local/share/sim4
116# where the sim4 package is installed
117#
118# 2. include a definition of an environmental variable SIM4 in
119# every script that will use Sim4.pm
120# $ENV{SIMR4DIR} = '/usr/local/share/sim4';
121
122BEGIN {
123
124    @SIM4_PARAMS= qw(A W X K C R D H P N B);
125    @OTHER_PARAMS= qw(CDNA_SEQ GENOMIC_SEQ OUTFILE);
126    @OTHER_SWITCHES = qw(SILENT QUIET VERBOSE);
127
128    # Authorize attribute fields
129    foreach my $attr ( @SIM4_PARAMS, @OTHER_PARAMS,
130                       @OTHER_SWITCHES) { $OK_FIELD{$attr}++; }
131}
132
133=head2 program_name
134
135 Title   : program_name
136 Usage   : $factory->program_name()
137 Function: holds the program name
138 Returns:  string
139 Args    : None
140
141=cut
142
143sub program_name {
144  return 'sim4';
145}
146
147=head2 program_dir
148
149 Title   : program_dir
150 Usage   : $factory->program_dir(@params)
151 Function: returns the program directory, obtained from ENV variable.
152 Returns:  string
153 Args    :
154
155=cut
156
157sub program_dir {
158  return Bio::Root::IO->catfile($ENV{SIM4DIR}) if $ENV{SIM4DIR};
159}
160
161sub new {
162  my ($class, @args) = @_;
163  my $self = $class->SUPER::new(@args);
164  # to facilitiate tempfile cleanup
165  $self->io->_initialize_io();
166  $self->A(0); # default
167  my ($attr, $value);
168
169  while (@args) {
170    $attr =   shift @args;
171    $value =  shift @args;
172    if ($attr =~/est_first/i )  {      #NEW
173       $self->{est_first} = $value;    #NEW
174       next;                           #NEW
175    }                                  #NEW
176    next if( $attr =~ /^-/ ); # don't want named parameters
177    if ($attr =~/'PROGRAM'/i )  {
178      $self->executable($value);
179      next;
180    }
181    $self->$attr($value);
182  }
183  return $self;
184}
185
186sub AUTOLOAD {
187    my $self = shift;
188    my $attr = $AUTOLOAD;
189    $attr =~ s/.*:://;
190    $attr = uc $attr;
191    $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr};
192    $self->{$attr} = shift if @_;
193    return $self->{$attr};
194}
195
196=head2  version
197
198 Title   : version
199 Usage   : not supported
200 Function: Cannot determine from program
201 Example :
202 Returns : float or undef
203 Args    : none
204
205=cut
206
207sub version {
208    my ($self) = @_;
209    return undef;
210}
211
212=head2  align
213
214 Title   : align
215 Usage   :
216            $cdna = 't/data/cdna.fa';
217            $genomic = 't/data/cdna.fa';
218            @exon_set = $factory->align($cdna,$genomic);
219          or
220            #@seq_array is array of Seq objs
221            $cdna = \@seq_array;
222            @exon_set = $factory->align($cdna,$genomic);
223          of
224            @exon_set = $factory->align($cdna->[0],$genomic)
225
226 Function: Perform a Sim4  alignment
227 Returns : An array of Bio::SeqFeature::Generic objects which has
228           exons as sub seqfeatures.
229 Args    : Name of two files containing fasta sequences,
230           or 2 Bio::SeqI objects
231           or a combination of both
232           first is assumed to be cdna
233           second is assumed to be genomic
234           More than one cdna may be provided. If an object,
235           assume that its an array ref.
236
237=cut
238
239sub align {
240
241    my ($self,$cdna,$genomic) = @_;
242
243    $self->cdna_seq($cdna) if $cdna;
244    $self->throw("Need to provide a cdna sequence") unless $self->cdna_seq;
245
246    $self->genomic_seq($genomic) if $genomic;
247    $self->throw("Need to provide a genomic sequence") unless $self->genomic_seq;
248
249    my ($temp,$infile1, $infile2, $est_first,$seq);
250    my ($attr, $value, $switch);
251
252# Create input file pointer
253    ($est_first,$infile1,$infile2)= $self->_setinput($self->cdna_seq,$self->genomic_seq);
254    if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in align!");}
255
256# Create parameter string to pass to Sim4 program
257    my $param_string = $self->_setparams();
258
259# run Sim4
260    my @exon_sets = $self->_run($est_first,$infile1,$infile2,$param_string);
261    return @exon_sets;
262}
263
264#################################################
265#internal methods
266
267=head2  _run
268
269 Title   :  _run
270 Usage   :  Internal function, not to be called directly
271 Function:   makes actual system call to Sim4 program
272 Example :
273 Returns : nothing; Sim4  output is written to a temp file
274 Args    : Name of a file containing a set of unaligned fasta sequences
275           and hash of parameters to be passed to Sim4
276
277=cut
278
279sub _run {
280    my ($self,$estfirst,$infile1,$infile2,$param_string) = @_;
281    my $instring;
282    $self->debug( "Program ".$self->executable."\n");
283    if(! $self->outfile){
284        my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir);
285	close($tfh);
286	undef $tfh;
287        $self->outfile($outfile);
288    }
289    my $outfile = $self->outfile();
290    my $commandstring = $self->executable." $infile1 $infile2 $param_string > $outfile";
291    if($self->quiet || $self->silent || ($self->verbose < 0)){
292      my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
293      $commandstring .= " 2>$null";
294    }
295    $self->debug( "Sim4 command = $commandstring");
296    my $status = system($commandstring);
297    $self->throw( "Sim4 call ($commandstring) crashed: $? \n") unless $status==0;
298
299    #use Sim4 parser
300    my $sim4_parser = Bio::Tools::Sim4::Results->new(-file=>$outfile,-estfirst=>$estfirst);
301    my @out;
302    while(my $exonset = $sim4_parser->next_exonset){
303        push @out, $exonset;
304    }
305    return @out;
306}
307
308=head2  _setinput()
309
310 Title   :  _setinput
311 Usage   :  Internal function, not to be called directly
312 Function:   Create input file for Sim4 program
313 Example :
314 Returns : name of file containing Sim4 data input
315 Args    : Seq or Align object reference or input file name
316
317=cut
318
319sub _setinput {
320  my ($self, $cdna,$genomic) = @_;
321  my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2);
322  #my $estfirst=1;
323  my $estfirst= defined($self->{est_first}) ? $self->{_est_first} : 1;
324  my ($cdna_file,$genomic_file);
325  #a sequence obj
326  if(ref($cdna)) {
327      my @cdna = ref $cdna eq "ARRAY" ? @{$cdna} : ($cdna);
328      ($tfh1,$cdna_file) = $self->io->tempfile(-dir=>$self->tempdir);
329      my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta');
330      foreach my $c (@cdna){
331	  $seqio->write_seq($c);
332      }
333      close $tfh1;
334      undef $tfh1;
335
336      #if we have a est database, then input will  go second
337      if($#cdna > 0){
338	  $estfirst=0;
339      }
340  }
341  else {
342     my $sio = Bio::SeqIO->new(-file=>$cdna,-format=>"fasta");
343     my $count = 0;
344     while(my $seq = $sio->next_seq){
345         $count++;
346     }
347     $estfirst = $count > 1 ? 0:1;
348     $cdna_file = $cdna;
349  }
350  if( ref($genomic) ) {
351      ($tfh1,$genomic_file) = $self->io->tempfile(-dir=>$self->tempdir);
352      my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta');
353      $seqio->write_seq($genomic);
354      close $tfh1;
355      undef $tfh1;
356  }
357  else {
358      $genomic_file = $genomic;
359  }
360  return ($estfirst,$cdna_file,$genomic_file) if $estfirst;
361  return ($estfirst,$genomic_file,$cdna_file);
362}
363
364
365=head2  _setparams()
366
367 Title   :  _setparams
368 Usage   :  Internal function, not to be called directly
369 Function:   Create parameter inputs for Sim4 program
370 Example :
371 Returns : parameter string to be passed to Sim4
372           during align or profile_align
373 Args    : name of calling object
374
375=cut
376
377sub _setparams {
378    my ($attr, $value, $self);
379
380    $self = shift;
381
382    my $param_string = "";
383    for  $attr ( @SIM4_PARAMS ) {
384      $value = $self->$attr();
385      next unless (defined $value);
386      my $attr_key = uc $attr; #put params in format expected by Sim4
387      $attr_key = ' '.$attr_key;
388      $param_string .= $attr_key.'='.$value;
389    }
390
391    return $param_string;
392}
393
3941; # Needed to keep compiler happy
395