1# 2# BioPerl module for Bio::Tools::Run::MCS 3# 4# Please direct questions and support issues to <bioperl-l@bioperl.org> 5# 6# Cared for by Sendu Bala <bix@sendu.me.uk> 7# 8# Copyright Sendu Bala 9# 10# You may distribute this module under the same terms as perl itself 11 12# POD documentation - main docs before the code 13 14=head1 NAME 15 16Bio::Tools::Run::MCS - Wrapper for MCS 17 18=head1 SYNOPSIS 19 20 use Bio::Tools::Run::MCS; 21 22 # Make a MCS factory 23 $factory = Bio::Tools::Run::MCS->new(); 24 25 # Run MCS on an alignment 26 my @results = $factory->run($alignfilename); 27 28 # or with alignment object 29 @results = $factory->run($bio_simplalign); 30 31 # look at the results 32 foreach my $feat (@results) { 33 my $seq_id = $feat->seq_id; 34 my $start = $feat->start; 35 my $end = $feat->end; 36 my $score = $feat->score; 37 my ($pvalue) = $feat->get_tag_values('pvalue'); 38 my ($kind) = $feat->get_tag_values('kind'); # 'all', 'exon' or 'nonexon' 39 } 40 41=head1 DESCRIPTION 42 43This is a wrapper for running the MCS (binCons) scripts by Elliott H Margulies. 44You can get details here: http://zoo.nhgri.nih.gov/elliott/mcs_doc/. MCS is used 45for the prediciton of transcription factor binding sites and other regions of 46the genome conserved amongst different species. 47 48Note that this wrapper assumes you already have alignments, so only uses MCS 49for the latter stages (the stages involving align2binomial.pl, 50generate_phyloMAX_score.pl and generate_mcs_beta.pl). 51 52You can try supplying normal MCS command-line arguments to new(), eg. 53 54 $factory->new(-percentile => 95) 55 56or calling arg-named methods (excluding the initial 57hyphens, eg. 58 59 $factory->percentile(95) 60 61 to set the --percentile arg). 62 63 64You will need to enable this MCS wrapper to find the MCS scripts. 65This can be done in (at least) three ways: 66 67 1. Make sure the MCS scripts are in your path. 68 2. Define an environmental variable MCSDIR which is a 69 directory which contains the MCS scripts: 70 In bash: 71 72 export MCSDIR=/home/username/mcs/ 73 74 In csh/tcsh: 75 76 setenv MCSDIR /home/username/mcs 77 78 3. Include a definition of an environmental variable MCSDIR in 79 every script that will use this MCS wrapper module, e.g.: 80 81 BEGIN { $ENV{MCSDIR} = '/home/username/mcs/' } 82 use Bio::Tools::Run::MCS; 83 84=head1 FEEDBACK 85 86=head2 Mailing Lists 87 88User feedback is an integral part of the evolution of this and other 89Bioperl modules. Send your comments and suggestions preferably to 90the Bioperl mailing list. Your participation is much appreciated. 91 92 bioperl-l@bioperl.org - General discussion 93 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 94 95=head2 Support 96 97Please direct usage questions or support issues to the mailing list: 98 99I<bioperl-l@bioperl.org> 100 101rather than to the module maintainer directly. Many experienced and 102reponsive experts will be able look at the problem and quickly 103address it. Please include a thorough description of the problem 104with code and data examples if at all possible. 105 106=head2 Reporting Bugs 107 108Report bugs to the Bioperl bug tracking system to help us keep track 109of the bugs and their resolution. Bug reports can be submitted via 110the web: 111 112 http://redmine.open-bio.org/projects/bioperl/ 113 114=head1 AUTHOR - Sendu Bala 115 116Email bix@sendu.me.uk 117 118=head1 APPENDIX 119 120The rest of the documentation details each of the object methods. 121Internal methods are usually preceded with a _ 122 123=cut 124 125package Bio::Tools::Run::MCS; 126use strict; 127 128use Cwd; 129use File::Spec; 130use Bio::AlignIO; 131use Bio::FeatureIO; 132use Bio::Annotation::SimpleValue; 133 134use base qw(Bio::Tools::Run::Phylo::PhyloBase); 135 136our $PROGRAM_NAME = 'align2binomial.pl'; 137our $PROGRAM_DIR; 138 139# methods for the mcs args we support 140our @PARAMS = qw(neutral percentile mcs specificity sensitivity name); 141our @SWITCHES = qw(neg-score); 142 143# just to be explicit, args we don't support (yet) or we handle ourselves 144our @UNSUPPORTED = qw(ucsc gtf neutral-only fourd-align align-only ar); 145 146BEGIN { 147 # lets add all the mcs scripts to the path so that when we call 148 # align2binomial.pl it can find its siblings 149 $PROGRAM_DIR = $ENV{'MCSDIR'}; 150 $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR; 151} 152 153=head2 program_name 154 155 Title : program_name 156 Usage : $factory>program_name() 157 Function: holds the program name 158 Returns : string 159 Args : None 160 161=cut 162 163sub program_name { 164 return $PROGRAM_NAME; 165} 166 167=head2 program_dir 168 169 Title : program_dir 170 Usage : $factory->program_dir(@params) 171 Function: returns the program directory, obtained from ENV variable. 172 Returns : string 173 Args : None 174 175=cut 176 177sub program_dir { 178 return $PROGRAM_DIR; 179} 180 181=head2 new 182 183 Title : new 184 Usage : $factory = Bio::Tools::Run::MCS->new() 185 Function: creates a new MCS factory 186 Returns : Bio::Tools::Run::MCS 187 Args : Many options understood by MCS can be supplied as key => 188 value pairs. 189 190 These options can NOT be used with this wrapper: 191 ucsc gtf neutral-only fourd-align align-only ar 192 193=cut 194 195sub new { 196 my ($class, @args) = @_; 197 my $self = $class->SUPER::new(@args); 198 199 $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'], 200 -create => 1); 201 202 return $self; 203} 204 205=head2 run 206 207 Title : run 208 Usage : $result = $factory->run($align_file_or_object, Bio::Location::Atomic, 209 [Bio::SeqFeatureI]); 210 Function: Runs the MCS scripts on an alignment. 211 Returns : list of Bio::SeqFeatureI feature objects (with coordinates corrected 212 according to the supplied offset, if any) 213 Args : The first argument represents an alignment, the optional second 214 argument represents the chromosome, stand and end and the optional 215 third argument represents annotation of the exons in the alignment. 216 217 The alignment can be provided as a multi-fasta format alignment 218 filename, or a Bio::Align::AlignI compliant object (eg. a 219 Bio::SimpleAlign). 220 221 The position in the genome can be provided as a Bio::Location::Atomic 222 with start, end and seq_id set. 223 224 The annnotation can be provided as an array of Bio::SeqFeatureI 225 objects. 226 227=cut 228 229sub run { 230 my ($self, $aln, $offset, $exon_feats) = @_; 231 $self->_alignment($aln || $self->alignment || $self->throw("An alignment must be supplied")); 232 233 return $self->_run($offset, $exon_feats); 234} 235 236sub _run { 237 my ($self, $atomic, $exon_feats) = @_; 238 239 my $exe = $self->executable || return; 240 241 # cd to a temp dir 242 my $temp_dir = $self->tempdir; 243 my $cwd = Cwd->cwd(); 244 chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'"); 245 246 my $offset = ''; 247 my $start_adjust = 0; 248 if ($atomic) { 249 $start_adjust = $atomic->start; 250 $offset = '--ucsc '.$atomic->seq_id.':'.$start_adjust.'-'.$atomic->end; 251 $start_adjust--; 252 } 253 254 my $gtf_file = 'exons.gtf'; 255 if ($exon_feats) { 256 my $fout = Bio::FeatureIO->new(-file => ">$gtf_file", -format => 'gtf'); 257 foreach my $feat (@{$exon_feats}) { 258 $fout->write_feature($feat); 259 } 260 } 261 my $gtf = $exon_feats ? "--gtf $gtf_file" : ''; 262 263 # step '2' (http://zoo.nhgri.nih.gov/elliott/mcs_doc/node1.html) of MCS: 264 # run align2binomial.pl to calculate individual species binomial scores 265 my $aln_file = $self->_write_alignment; 266 my $error_file = 'stderr'; 267 my $binomial_file = 'align_name.binomial'; 268 my $cmd = "align2binomial.pl $offset $gtf $aln_file > $binomial_file 2> $error_file"; 269 #system("rm -fr $cwd/mcs_dir; cp -R $temp_dir $cwd/mcs_dir"); 270 my $throw = system($cmd); 271 open(my $efh, "<", $error_file) || $self->throw("Could not open error file '$error_file'"); 272 my $error; 273 while (<$efh>) { 274 $error .= $_; 275 $throw = 1 if /not divisible by 3/; 276 } 277 close($efh); 278 $self->throw($error) if $throw; 279 280 # step '3': run generate_phyloMAX_score.pl to combine the individual 281 # binomial scores and generate the final Multi-species Conservation Score 282 my $phylo_file = 'align_name.phylo'; 283 system("generate_phyloMAX_score.pl $binomial_file > $phylo_file") && $self->throw("generate_phyloMAX_score.pl call failed: $?, $!"); 284 285 # step '4': Generate MCSs from the conservation score using 286 # generate_mcs_beta.pl 287 my $mcs_file = 'mcs_result.stdout'; 288 my $bed_file = 'align_name.bed'; # hardcoded in generate_mcs_beta.pl 289 system("generate_mcs_beta.pl $offset $gtf $phylo_file > $mcs_file") && $self->throw("generate_mcs_beta.pl failed: $?, $!"); 290 291 my @feats; 292 my $fin = Bio::FeatureIO->new(-file => $bed_file, -format => 'bed'); 293 my $source = Bio::Annotation::SimpleValue->new(-value => 'MCS'); 294 while (my $feat = $fin->next_feature()) { 295 # convert coords given offset 296 if ($start_adjust) { 297 $feat->start($feat->start + $start_adjust); 298 $feat->end($feat->end + $start_adjust); 299 } 300 $feat->source($source); 301 push(@feats, $feat); 302 } 303 304 # cd back again 305 chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'"); 306 307 return @feats; 308} 309 310=head2 _setparams 311 312 Title : _setparams 313 Usage : Internal function, not to be called directly 314 Function: Creates a string of params to be used in the command string 315 Returns : string of params 316 Args : none 317 318=cut 319 320sub _setparams { 321 my $self = shift; 322 323 my $param_string = $self->SUPER::_setparams(-params => \@PARAMS, 324 -switches => \@SWITCHES, 325 -dash => 1); 326 327 my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null'; 328 $param_string .= " 1>$null" if $self->quiet; 329 330 return $param_string; 331} 332 3331; 334