1#
2# BioPerl module for Bio::Tools::Run::MCS
3#
4# Please direct questions and support issues to <bioperl-l@bioperl.org>
5#
6# Cared for by Sendu Bala <bix@sendu.me.uk>
7#
8# Copyright Sendu Bala
9#
10# You may distribute this module under the same terms as perl itself
11
12# POD documentation - main docs before the code
13
14=head1 NAME
15
16Bio::Tools::Run::MCS - Wrapper for MCS
17
18=head1 SYNOPSIS
19
20  use Bio::Tools::Run::MCS;
21
22  # Make a MCS factory
23  $factory = Bio::Tools::Run::MCS->new();
24
25  # Run MCS on an alignment
26  my @results = $factory->run($alignfilename);
27
28  # or with alignment object
29  @results = $factory->run($bio_simplalign);
30
31  # look at the results
32  foreach my $feat (@results) {
33    my $seq_id = $feat->seq_id;
34    my $start = $feat->start;
35    my $end = $feat->end;
36    my $score = $feat->score;
37    my ($pvalue) = $feat->get_tag_values('pvalue');
38    my ($kind) = $feat->get_tag_values('kind'); # 'all', 'exon' or 'nonexon'
39  }
40
41=head1 DESCRIPTION
42
43This is a wrapper for running the MCS (binCons) scripts by Elliott H Margulies.
44You can get details here: http://zoo.nhgri.nih.gov/elliott/mcs_doc/. MCS is used
45for the prediciton of transcription factor binding sites and other regions of
46the genome conserved amongst different species.
47
48Note that this wrapper assumes you already have alignments, so only uses MCS
49for the latter stages (the stages involving align2binomial.pl,
50generate_phyloMAX_score.pl and generate_mcs_beta.pl).
51
52You can try supplying normal MCS command-line arguments to new(), eg.
53
54  $factory->new(-percentile => 95)
55
56or calling arg-named methods (excluding the initial
57hyphens, eg.
58
59  $factory->percentile(95)
60
61 to set the --percentile arg).
62
63
64You will need to enable this MCS wrapper to find the MCS scripts.
65This can be done in (at least) three ways:
66
67 1. Make sure the MCS scripts are in your path.
68 2. Define an environmental variable MCSDIR which is a
69    directory which contains the MCS scripts:
70    In bash:
71
72    export MCSDIR=/home/username/mcs/
73
74    In csh/tcsh:
75
76    setenv MCSDIR /home/username/mcs
77
78 3. Include a definition of an environmental variable MCSDIR in
79    every script that will use this MCS wrapper module, e.g.:
80
81    BEGIN { $ENV{MCSDIR} = '/home/username/mcs/' }
82    use Bio::Tools::Run::MCS;
83
84=head1 FEEDBACK
85
86=head2 Mailing Lists
87
88User feedback is an integral part of the evolution of this and other
89Bioperl modules. Send your comments and suggestions preferably to
90the Bioperl mailing list.  Your participation is much appreciated.
91
92  bioperl-l@bioperl.org                  - General discussion
93  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
94
95=head2 Support
96
97Please direct usage questions or support issues to the mailing list:
98
99I<bioperl-l@bioperl.org>
100
101rather than to the module maintainer directly. Many experienced and
102reponsive experts will be able look at the problem and quickly
103address it. Please include a thorough description of the problem
104with code and data examples if at all possible.
105
106=head2 Reporting Bugs
107
108Report bugs to the Bioperl bug tracking system to help us keep track
109of the bugs and their resolution. Bug reports can be submitted via
110the web:
111
112  http://redmine.open-bio.org/projects/bioperl/
113
114=head1 AUTHOR - Sendu Bala
115
116Email bix@sendu.me.uk
117
118=head1 APPENDIX
119
120The rest of the documentation details each of the object methods.
121Internal methods are usually preceded with a _
122
123=cut
124
125package Bio::Tools::Run::MCS;
126use strict;
127
128use Cwd;
129use File::Spec;
130use Bio::AlignIO;
131use Bio::FeatureIO;
132use Bio::Annotation::SimpleValue;
133
134use base qw(Bio::Tools::Run::Phylo::PhyloBase);
135
136our $PROGRAM_NAME = 'align2binomial.pl';
137our $PROGRAM_DIR;
138
139# methods for the mcs args we support
140our @PARAMS   = qw(neutral percentile mcs specificity sensitivity name);
141our @SWITCHES = qw(neg-score);
142
143# just to be explicit, args we don't support (yet) or we handle ourselves
144our @UNSUPPORTED = qw(ucsc gtf neutral-only fourd-align align-only ar);
145
146BEGIN {
147    # lets add all the mcs scripts to the path so that when we call
148    # align2binomial.pl it can find its siblings
149    $PROGRAM_DIR = $ENV{'MCSDIR'};
150    $ENV{PATH} = "$PROGRAM_DIR:$ENV{PATH}" if $PROGRAM_DIR;
151}
152
153=head2 program_name
154
155 Title   : program_name
156 Usage   : $factory>program_name()
157 Function: holds the program name
158 Returns : string
159 Args    : None
160
161=cut
162
163sub program_name {
164    return $PROGRAM_NAME;
165}
166
167=head2 program_dir
168
169 Title   : program_dir
170 Usage   : $factory->program_dir(@params)
171 Function: returns the program directory, obtained from ENV variable.
172 Returns : string
173 Args    : None
174
175=cut
176
177sub program_dir {
178    return $PROGRAM_DIR;
179}
180
181=head2 new
182
183 Title   : new
184 Usage   : $factory = Bio::Tools::Run::MCS->new()
185 Function: creates a new MCS factory
186 Returns : Bio::Tools::Run::MCS
187 Args    : Many options understood by MCS can be supplied as key =>
188           value pairs.
189
190           These options can NOT be used with this wrapper:
191           ucsc gtf neutral-only fourd-align align-only ar
192
193=cut
194
195sub new {
196    my ($class, @args) = @_;
197    my $self = $class->SUPER::new(@args);
198
199    $self->_set_from_args(\@args, -methods => [@PARAMS, @SWITCHES, 'quiet'],
200                                  -create => 1);
201
202    return $self;
203}
204
205=head2 run
206
207 Title   : run
208 Usage   : $result = $factory->run($align_file_or_object, Bio::Location::Atomic,
209                                   [Bio::SeqFeatureI]);
210 Function: Runs the MCS scripts on an alignment.
211 Returns : list of Bio::SeqFeatureI feature objects (with coordinates corrected
212           according to the supplied offset, if any)
213 Args    : The first argument represents an alignment, the optional second
214           argument represents the chromosome, stand and end and the optional
215           third argument represents annotation of the exons in the alignment.
216
217           The alignment can be provided as a multi-fasta format alignment
218           filename, or a Bio::Align::AlignI compliant object (eg. a
219           Bio::SimpleAlign).
220
221           The position in the genome can be provided as a Bio::Location::Atomic
222           with start, end and seq_id set.
223
224           The annnotation can be provided as an array of Bio::SeqFeatureI
225           objects.
226
227=cut
228
229sub run {
230    my ($self, $aln, $offset, $exon_feats) = @_;
231    $self->_alignment($aln || $self->alignment || $self->throw("An alignment must be supplied"));
232
233    return $self->_run($offset, $exon_feats);
234}
235
236sub _run {
237    my ($self, $atomic, $exon_feats) = @_;
238
239    my $exe = $self->executable || return;
240
241    # cd to a temp dir
242    my $temp_dir = $self->tempdir;
243    my $cwd = Cwd->cwd();
244    chdir($temp_dir) || $self->throw("Couldn't change to temp dir '$temp_dir'");
245
246    my $offset = '';
247    my $start_adjust = 0;
248    if ($atomic) {
249        $start_adjust = $atomic->start;
250        $offset = '--ucsc '.$atomic->seq_id.':'.$start_adjust.'-'.$atomic->end;
251        $start_adjust--;
252    }
253
254    my $gtf_file = 'exons.gtf';
255    if ($exon_feats) {
256        my $fout = Bio::FeatureIO->new(-file => ">$gtf_file", -format => 'gtf');
257        foreach my $feat (@{$exon_feats}) {
258            $fout->write_feature($feat);
259        }
260    }
261    my $gtf = $exon_feats ? "--gtf $gtf_file" : '';
262
263    # step '2' (http://zoo.nhgri.nih.gov/elliott/mcs_doc/node1.html) of MCS:
264    # run align2binomial.pl to calculate individual species binomial scores
265    my $aln_file = $self->_write_alignment;
266    my $error_file = 'stderr';
267    my $binomial_file = 'align_name.binomial';
268    my $cmd = "align2binomial.pl $offset $gtf $aln_file > $binomial_file 2> $error_file";
269    #system("rm -fr $cwd/mcs_dir; cp -R $temp_dir $cwd/mcs_dir");
270    my $throw = system($cmd);
271    open(my $efh, "<", $error_file) || $self->throw("Could not open error file '$error_file'");
272    my $error;
273    while (<$efh>) {
274        $error .= $_;
275        $throw = 1 if /not divisible by 3/;
276    }
277    close($efh);
278    $self->throw($error) if $throw;
279
280    # step '3': run generate_phyloMAX_score.pl to combine the individual
281    # binomial scores and generate the final Multi-species Conservation Score
282    my $phylo_file = 'align_name.phylo';
283    system("generate_phyloMAX_score.pl $binomial_file > $phylo_file") && $self->throw("generate_phyloMAX_score.pl call failed: $?, $!");
284
285    # step '4': Generate MCSs from the conservation score using
286    # generate_mcs_beta.pl
287    my $mcs_file = 'mcs_result.stdout';
288    my $bed_file = 'align_name.bed'; # hardcoded in generate_mcs_beta.pl
289    system("generate_mcs_beta.pl $offset $gtf $phylo_file > $mcs_file") && $self->throw("generate_mcs_beta.pl failed: $?, $!");
290
291    my @feats;
292    my $fin = Bio::FeatureIO->new(-file => $bed_file, -format => 'bed');
293    my $source = Bio::Annotation::SimpleValue->new(-value => 'MCS');
294    while (my $feat = $fin->next_feature()) {
295        # convert coords given offset
296        if ($start_adjust) {
297            $feat->start($feat->start + $start_adjust);
298            $feat->end($feat->end + $start_adjust);
299        }
300        $feat->source($source);
301        push(@feats, $feat);
302    }
303
304    # cd back again
305    chdir($cwd) || $self->throw("Couldn't change back to working directory '$cwd'");
306
307    return @feats;
308}
309
310=head2 _setparams
311
312 Title   : _setparams
313 Usage   : Internal function, not to be called directly
314 Function: Creates a string of params to be used in the command string
315 Returns : string of params
316 Args    : none
317
318=cut
319
320sub _setparams {
321    my $self = shift;
322
323    my $param_string = $self->SUPER::_setparams(-params => \@PARAMS,
324                                           -switches => \@SWITCHES,
325                                           -dash => 1);
326
327    my $null = ($^O =~ m/mswin/i) ? 'NUL' : '/dev/null';
328    $param_string .= " 1>$null" if $self->quiet;
329
330    return $param_string;
331}
332
3331;
334