1# BioPerl module for Bio::Tools::Run::Alignment::Sim4 2# 3# Please direct questions and support issues to <bioperl-l@bioperl.org> 4# 5# Cared for by 6# 7# Copyright Shawn Hoon 8# 9# You may distribute this module under the same terms as perl itself 10# POD documentation - main docs before the code 11 12=head1 NAME 13 14Bio::Tools::Run::Alignment::Sim4 - Wrapper for Sim4 program that allows 15for alignment of cdna to genomic sequences 16 17=head1 SYNOPSIS 18 19 use Bio::Tools::Run::Alignment::Sim4; 20 21 my @params = (W=>15,K=>17,D=>10,N=>10,cdna_seq=>"mouse_cdna.fa",genomic_seq=>"mouse_genomic.fa"); 22 my $sim4 = Bio::Tools::Run::Alignment::Sim4->new(@params); 23 24 my @exon_sets = $sim4->align; 25 26 foreach my $set(@exon_sets){ 27 foreach my $exon($set->sub_SeqFeature){ 28 print $exon->start."\t".$exon->end."\t".$exon->strand."\n"; 29 print "\tMatched ".$exon->est_hit->seq_id."\t".$exon->est_hit->start."\t".$exon->est_hit->end."\n"; 30 } 31 } 32 33 One can also provide a est database 34 35 $sio = Bio::SeqIO->new(-file=>"est.fa",-format=>"fasta"); 36 @est_seq=(); 37 while(my $seq = $sio->next_seq){ 38 push @est_seq,$seq; 39 } 40 41 my @exon_sets = $factory->align(\@est_seq,$genomic); 42 43=head1 DESCRIPTION 44 45Sim4 program is developed by Florea et al. for aligning cdna/est 46sequence to genomic sequences 47 48Florea L, Hartzell G, Zhang Z, Rubin GM, Miller W. 49A computer program for aligning a cDNA sequence with a genomic DNA sequence. 50Genome Res 1998 Sep;8(9):967-74 51 52The program is available for download here: 53http://globin.cse.psu.edu/ 54 55=head1 FEEDBACK 56 57=head2 Mailing Lists 58 59User feedback is an integral part of the evolution of this and other 60Bioperl modules. Send your comments and suggestions preferably to one 61of the Bioperl mailing lists. Your participation is much appreciated. 62 63 bioperl-l@bioperl.org - General discussion 64 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 65 66=head2 Support 67 68Please direct usage questions or support issues to the mailing list: 69 70I<bioperl-l@bioperl.org> 71 72rather than to the module maintainer directly. Many experienced and 73reponsive experts will be able look at the problem and quickly 74address it. Please include a thorough description of the problem 75with code and data examples if at all possible. 76 77=head2 Reporting Bugs 78 79Report bugs to the Bioperl bug tracking system to help us keep track 80the bugs and their resolution. Bug reports can be submitted via the 81web: 82 83 http://redmine.open-bio.org/projects/bioperl/ 84 85=head1 AUTHOR - Shawn Hoon 86 87Email shawnh@fugu-sg.org 88 89=head1 APPENDIX 90 91The rest of the documentation details each of the object 92methods. Internal methods are usually preceded with a _ 93 94=cut 95 96 97package Bio::Tools::Run::Alignment::Sim4; 98use vars qw($AUTOLOAD @ISA $PROGRAM $PROGRAMDIR $PROGRAMNAME 99 @SIM4_PARAMS @OTHER_PARAMS @OTHER_SWITCHES %OK_FIELD); 100use strict; 101use Bio::SeqIO; 102use Bio::SimpleAlign; 103use Bio::AlignIO; 104use Bio::Root::Root; 105use Bio::Root::IO; 106use Bio::Tools::Run::WrapperBase; 107use Bio::Tools::Sim4::Results; 108 109@ISA = qw(Bio::Root::Root Bio::Tools::Run::WrapperBase); 110 111# You will need to enable Sim4 to find the Sim4 program. This 112# can be done in (at least) two ways: 113# 114# 1. define an environmental variable SIM4DIR 115# export SIM4DIR =/usr/local/share/sim4 116# where the sim4 package is installed 117# 118# 2. include a definition of an environmental variable SIM4 in 119# every script that will use Sim4.pm 120# $ENV{SIMR4DIR} = '/usr/local/share/sim4'; 121 122BEGIN { 123 124 @SIM4_PARAMS= qw(A W X K C R D H P N B); 125 @OTHER_PARAMS= qw(CDNA_SEQ GENOMIC_SEQ OUTFILE); 126 @OTHER_SWITCHES = qw(SILENT QUIET VERBOSE); 127 128 # Authorize attribute fields 129 foreach my $attr ( @SIM4_PARAMS, @OTHER_PARAMS, 130 @OTHER_SWITCHES) { $OK_FIELD{$attr}++; } 131} 132 133=head2 program_name 134 135 Title : program_name 136 Usage : $factory->program_name() 137 Function: holds the program name 138 Returns: string 139 Args : None 140 141=cut 142 143sub program_name { 144 return 'sim4'; 145} 146 147=head2 program_dir 148 149 Title : program_dir 150 Usage : $factory->program_dir(@params) 151 Function: returns the program directory, obtained from ENV variable. 152 Returns: string 153 Args : 154 155=cut 156 157sub program_dir { 158 return Bio::Root::IO->catfile($ENV{SIM4DIR}) if $ENV{SIM4DIR}; 159} 160 161sub new { 162 my ($class, @args) = @_; 163 my $self = $class->SUPER::new(@args); 164 # to facilitiate tempfile cleanup 165 $self->io->_initialize_io(); 166 $self->A(0); # default 167 my ($attr, $value); 168 169 while (@args) { 170 $attr = shift @args; 171 $value = shift @args; 172 if ($attr =~/est_first/i ) { #NEW 173 $self->{est_first} = $value; #NEW 174 next; #NEW 175 } #NEW 176 next if( $attr =~ /^-/ ); # don't want named parameters 177 if ($attr =~/'PROGRAM'/i ) { 178 $self->executable($value); 179 next; 180 } 181 $self->$attr($value); 182 } 183 return $self; 184} 185 186sub AUTOLOAD { 187 my $self = shift; 188 my $attr = $AUTOLOAD; 189 $attr =~ s/.*:://; 190 $attr = uc $attr; 191 $self->throw("Unallowed parameter: $attr !") unless $OK_FIELD{$attr}; 192 $self->{$attr} = shift if @_; 193 return $self->{$attr}; 194} 195 196=head2 version 197 198 Title : version 199 Usage : not supported 200 Function: Cannot determine from program 201 Example : 202 Returns : float or undef 203 Args : none 204 205=cut 206 207sub version { 208 my ($self) = @_; 209 return undef; 210} 211 212=head2 align 213 214 Title : align 215 Usage : 216 $cdna = 't/data/cdna.fa'; 217 $genomic = 't/data/cdna.fa'; 218 @exon_set = $factory->align($cdna,$genomic); 219 or 220 #@seq_array is array of Seq objs 221 $cdna = \@seq_array; 222 @exon_set = $factory->align($cdna,$genomic); 223 of 224 @exon_set = $factory->align($cdna->[0],$genomic) 225 226 Function: Perform a Sim4 alignment 227 Returns : An array of Bio::SeqFeature::Generic objects which has 228 exons as sub seqfeatures. 229 Args : Name of two files containing fasta sequences, 230 or 2 Bio::SeqI objects 231 or a combination of both 232 first is assumed to be cdna 233 second is assumed to be genomic 234 More than one cdna may be provided. If an object, 235 assume that its an array ref. 236 237=cut 238 239sub align { 240 241 my ($self,$cdna,$genomic) = @_; 242 243 $self->cdna_seq($cdna) if $cdna; 244 $self->throw("Need to provide a cdna sequence") unless $self->cdna_seq; 245 246 $self->genomic_seq($genomic) if $genomic; 247 $self->throw("Need to provide a genomic sequence") unless $self->genomic_seq; 248 249 my ($temp,$infile1, $infile2, $est_first,$seq); 250 my ($attr, $value, $switch); 251 252# Create input file pointer 253 ($est_first,$infile1,$infile2)= $self->_setinput($self->cdna_seq,$self->genomic_seq); 254 if (!($infile1 && $infile2)) {$self->throw("Bad input data (sequences need an id ) or less than 2 sequences in align!");} 255 256# Create parameter string to pass to Sim4 program 257 my $param_string = $self->_setparams(); 258 259# run Sim4 260 my @exon_sets = $self->_run($est_first,$infile1,$infile2,$param_string); 261 return @exon_sets; 262} 263 264################################################# 265#internal methods 266 267=head2 _run 268 269 Title : _run 270 Usage : Internal function, not to be called directly 271 Function: makes actual system call to Sim4 program 272 Example : 273 Returns : nothing; Sim4 output is written to a temp file 274 Args : Name of a file containing a set of unaligned fasta sequences 275 and hash of parameters to be passed to Sim4 276 277=cut 278 279sub _run { 280 my ($self,$estfirst,$infile1,$infile2,$param_string) = @_; 281 my $instring; 282 $self->debug( "Program ".$self->executable."\n"); 283 if(! $self->outfile){ 284 my ($tfh, $outfile) = $self->io->tempfile(-dir=>$self->tempdir); 285 close($tfh); 286 undef $tfh; 287 $self->outfile($outfile); 288 } 289 my $outfile = $self->outfile(); 290 my $commandstring = $self->executable." $infile1 $infile2 $param_string > $outfile"; 291 if($self->quiet || $self->silent || ($self->verbose < 0)){ 292 my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; 293 $commandstring .= " 2>$null"; 294 } 295 $self->debug( "Sim4 command = $commandstring"); 296 my $status = system($commandstring); 297 $self->throw( "Sim4 call ($commandstring) crashed: $? \n") unless $status==0; 298 299 #use Sim4 parser 300 my $sim4_parser = Bio::Tools::Sim4::Results->new(-file=>$outfile,-estfirst=>$estfirst); 301 my @out; 302 while(my $exonset = $sim4_parser->next_exonset){ 303 push @out, $exonset; 304 } 305 return @out; 306} 307 308=head2 _setinput() 309 310 Title : _setinput 311 Usage : Internal function, not to be called directly 312 Function: Create input file for Sim4 program 313 Example : 314 Returns : name of file containing Sim4 data input 315 Args : Seq or Align object reference or input file name 316 317=cut 318 319sub _setinput { 320 my ($self, $cdna,$genomic) = @_; 321 my ($infilename, $seq, $temp, $tfh1,$tfh2,$outfile1,$outfile2); 322 #my $estfirst=1; 323 my $estfirst= defined($self->{est_first}) ? $self->{_est_first} : 1; 324 my ($cdna_file,$genomic_file); 325 #a sequence obj 326 if(ref($cdna)) { 327 my @cdna = ref $cdna eq "ARRAY" ? @{$cdna} : ($cdna); 328 ($tfh1,$cdna_file) = $self->io->tempfile(-dir=>$self->tempdir); 329 my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta'); 330 foreach my $c (@cdna){ 331 $seqio->write_seq($c); 332 } 333 close $tfh1; 334 undef $tfh1; 335 336 #if we have a est database, then input will go second 337 if($#cdna > 0){ 338 $estfirst=0; 339 } 340 } 341 else { 342 my $sio = Bio::SeqIO->new(-file=>$cdna,-format=>"fasta"); 343 my $count = 0; 344 while(my $seq = $sio->next_seq){ 345 $count++; 346 } 347 $estfirst = $count > 1 ? 0:1; 348 $cdna_file = $cdna; 349 } 350 if( ref($genomic) ) { 351 ($tfh1,$genomic_file) = $self->io->tempfile(-dir=>$self->tempdir); 352 my $seqio = Bio::SeqIO->new(-fh=>$tfh1,-format=>'fasta'); 353 $seqio->write_seq($genomic); 354 close $tfh1; 355 undef $tfh1; 356 } 357 else { 358 $genomic_file = $genomic; 359 } 360 return ($estfirst,$cdna_file,$genomic_file) if $estfirst; 361 return ($estfirst,$genomic_file,$cdna_file); 362} 363 364 365=head2 _setparams() 366 367 Title : _setparams 368 Usage : Internal function, not to be called directly 369 Function: Create parameter inputs for Sim4 program 370 Example : 371 Returns : parameter string to be passed to Sim4 372 during align or profile_align 373 Args : name of calling object 374 375=cut 376 377sub _setparams { 378 my ($attr, $value, $self); 379 380 $self = shift; 381 382 my $param_string = ""; 383 for $attr ( @SIM4_PARAMS ) { 384 $value = $self->$attr(); 385 next unless (defined $value); 386 my $attr_key = uc $attr; #put params in format expected by Sim4 387 $attr_key = ' '.$attr_key; 388 $param_string .= $attr_key.'='.$value; 389 } 390 391 return $param_string; 392} 393 3941; # Needed to keep compiler happy 395