1package Ace::Sequence;
2use strict;
3
4use Carp;
5use strict;
6use Ace 1.50 qw(:DEFAULT rearrange);
7use Ace::Sequence::FeatureList;
8use Ace::Sequence::Feature;
9use AutoLoader 'AUTOLOAD';
10use vars '$VERSION';
11my %CACHE;
12
13$VERSION = '1.51';
14
15use constant CACHE => 1;
16
17use overload
18  '""'       => 'asString',
19  cmp        => 'cmp',
20;
21
22# synonym: stop = end
23*stop = \&end;
24*abs = \&absolute;
25*source_seq = \&source;
26*source_tag = \&subtype;
27*primary_tag = \&type;
28
29my %plusminus = (	 '+' => '-',
30		 '-' => '+',
31		 '.' => '.');
32
33# internal keys
34#    parent    => reference Sequence in "+" strand
35#    p_offset  => our start in the parent
36#    length    => our length
37#    strand    => our strand (+ or -)
38#    refseq    => reference Sequence for coordinate system
39
40# object constructor
41# usually called like this:
42# $seq = Ace::Sequence->new($object);
43# but can be called like this:
44# $seq = Ace::Sequence->new(-db=>$db,-name=>$name);
45# or
46# $seq = Ace::Sequence->new(-seq    => $object,
47#                           -offset => $offset,
48#                           -length => $length,
49#                           -ref    => $refseq
50#                           );
51# $refseq, if provided, will be used to establish the coordinate
52# system.  Otherwise the first base pair will be set to 1.
53sub new {
54  my $pack = shift;
55  my ($seq,$start,$end,$offset,$length,$refseq,$db) =
56    rearrange([
57	       ['SEQ','SEQUENCE','SOURCE'],
58	      'START',
59	       ['END','STOP'],
60	       ['OFFSET','OFF'],
61	       ['LENGTH','LEN'],
62	       'REFSEQ',
63	       ['DATABASE','DB'],
64	      ],@_);
65
66  # Object must have a parent sequence and/or a reference
67  # sequence.  In some cases, the parent sequence will be the
68  # object itself.  The reference sequence is used to set up
69  # the frame of reference for the coordinate system.
70
71  # fetch the sequence object if we don't have it already
72  croak "Please provide either a Sequence object or a database and name"
73    unless ref($seq) || ($seq && $db);
74
75  # convert start into offset
76  $offset = $start - 1 if defined($start) and !defined($offset);
77
78  # convert stop/end into length
79  $length = ($end > $start) ? $end - $offset : $end - $offset - 2
80    if defined($end) && !defined($length);
81
82  # if just a string is passed, try to fetch a Sequence object
83  my $obj = ref($seq) ? $seq : $db->fetch('Sequence'=>$seq);
84  unless ($obj) {
85    Ace->error("No Sequence named $obj found in database");
86    return;
87  }
88
89  # get parent coordinates and length of this sequence
90  # the parent is an Ace Sequence object in the "+" strand
91  my ($parent,$p_offset,$p_length,$strand) = find_parent($obj);
92  return unless $parent;
93
94  # handle negative strands
95  my $r_strand = $strand;
96  my $r_offset = $p_offset;
97  $offset ||= 0;
98  $offset *= -1 if $strand < 0;
99
100  # handle feature objects
101  $offset += $obj->offset if $obj->can('smapped');
102
103  # get source
104  my $source = $obj->can('smapped') ? $obj->source : $obj;
105
106  # store the object into our instance variables
107  my $self = bless {
108		    obj        => $source,
109		    offset     => $offset,
110		    length     => $length || $p_length,
111		    parent     => $parent,
112		    p_offset   => $p_offset,
113		    refseq     => [$source,$r_offset,$r_strand],
114		    strand     => $strand,
115		    absolute   => 0,
116		    automerge  => 1,
117		   },$pack;
118
119  # set the reference sequence
120  eval { $self->refseq($refseq) } or return if defined $refseq;
121
122  # wheww!
123  return $self;
124}
125
126# return the "source" object that the user offset from
127sub source {
128  $_[0]->{obj};
129}
130
131# return the parent object
132sub parent { $_[0]->{parent} }
133
134# return the length
135#sub length { $_[0]->{length} }
136sub length {
137  my $self = shift;
138  my ($start,$end) = ($self->start,$self->end);
139  return $end - $start + ($end > $start ? 1 : -1);  # for stupid 1-based adjustments
140}
141
142sub reversed {  return shift->strand < 0; }
143
144sub automerge {
145  my $self = shift;
146  my $d = $self->{automerge};
147  $self->{automerge} = shift if @_;
148  $d;
149}
150
151# return reference sequence
152sub refseq {
153  my $self = shift;
154  my $prev = $self->{refseq};
155  if (@_) {
156    my $refseq = shift;
157    my $arrayref;
158
159  BLOCK: {
160      last BLOCK unless defined ($refseq);
161
162      if (ref($refseq) && ref($refseq) eq 'ARRAY') {
163	$arrayref = $refseq;
164	last BLOCK;
165      }
166
167      if (ref($refseq) && ($refseq->can('smapped'))) {
168	croak "Reference sequence has no common ancestor with sequence"
169	  unless $self->parent eq $refseq->parent;
170	my ($a,$b,$c) = @{$refseq->{refseq}};
171	#	$b += $refseq->offset;
172	$b += $refseq->offset;
173	$arrayref = [$refseq,$b,$refseq->strand];
174	last BLOCK;
175      }
176
177
178      # look up reference sequence in database if we aren't given
179      # database object already
180      $refseq = $self->db->fetch('Sequence' => $refseq)
181	unless $refseq->isa('Ace::Object');
182      croak "Invalid reference sequence" unless $refseq;
183
184      # find position of ref sequence in parent strand
185      my ($r_parent,$r_offset,$r_length,$r_strand) = find_parent($refseq);
186      croak "Reference sequence has no common ancestor with sequence"
187	unless $r_parent eq $self->{parent};
188
189      # set to array reference containing this information
190      $arrayref = [$refseq,$r_offset,$r_strand];
191    }
192    $self->{refseq} = $arrayref;
193  }
194  return unless $prev;
195  return $self->parent if $self->absolute;
196  return wantarray ? @{$prev} : $prev->[0];
197}
198
199# return strand
200sub strand { return $_[0]->{strand} }
201
202# return reference strand
203sub r_strand {
204  my $self = shift;
205  return "+1" if $self->absolute;
206  if (my ($ref,$r_offset,$r_strand) = $self->refseq) {
207    return $r_strand;
208  } else {
209    return $self->{strand}
210  }
211}
212
213sub offset { $_[0]->{offset} }
214sub p_offset { $_[0]->{p_offset} }
215
216sub smapped { 1; }
217sub type    { 'Sequence' }
218sub subtype { }
219
220sub debug {
221  my $self = shift;
222  my $d = $self->{_debug};
223  $self->{_debug} = shift if @_;
224  $d;
225}
226
227# return the database this sequence is associated with
228sub db {
229  return Ace->name2db($_[0]->{db} ||= $_[0]->source->db);
230}
231
232sub start {
233  my ($self,$abs) = @_;
234  $abs = $self->absolute unless defined $abs;
235  return $self->{p_offset} + $self->{offset} + 1 if $abs;
236
237  if ($self->refseq) {
238    my ($ref,$r_offset,$r_strand) = $self->refseq;
239    return $r_strand < 0 ? 1 + $r_offset - ($self->{p_offset} + $self->{offset})
240                         : 1 + $self->{p_offset} + $self->{offset} - $r_offset;
241  }
242
243  else {
244    return $self->{offset} +1;
245  }
246
247}
248
249sub end {
250  my ($self,$abs) = @_;
251  my $start = $self->start($abs);
252  my $f = $self->{length} > 0 ? 1 : -1;  # for stupid 1-based adjustments
253  if ($abs && $self->refseq ne $self->parent) {
254    my $r_strand = $self->r_strand;
255    return $start - $self->{length} + $f
256      if $r_strand < 0 or $self->{strand} < 0 or $self->{length} < 0;
257    return  $start + $self->{length} - $f
258  }
259  return  $start + $self->{length} - $f if $self->r_strand eq $self->{strand};
260  return  $start - $self->{length} + $f;
261}
262
263# turn on absolute coordinates (relative to reference sequence)
264sub absolute {
265  my $self = shift;
266  my $prev = $self->{absolute};
267  $self->{absolute} = $_[0] if defined $_[0];
268  return $prev;
269}
270
271# human readable string (for debugging)
272sub asString {
273  my $self = shift;
274  if ($self->absolute) {
275    return join '',$self->parent,'/',$self->start,',',$self->end;
276  } elsif (my $ref = $self->refseq){
277    my $label = $ref->isa('Ace::Sequence::Feature') ? $ref->info : "$ref";
278    return join '',$label,'/',$self->start,',',$self->end;
279
280  } else {
281    join '',$self->source,'/',$self->start,',',$self->end;
282  }
283}
284
285sub cmp {
286  my ($self,$arg,$reversed) = @_;
287  if (ref($arg) and $arg->isa('Ace::Sequence')) {
288    my $cmp = $self->parent cmp $arg->parent
289      || $self->start <=> $arg->start;
290    return $reversed ? -$cmp : $cmp;
291  }
292  my $name = $self->asString;
293  return $reversed ? $arg cmp $name : $name cmp $arg;
294}
295
296# Return the DNA
297sub dna {
298  my $self = shift;
299  return $self->{dna} if $self->{dna};
300  my $raw = $self->_query('seqdna');
301  $raw=~s/^>.*\n//m;
302  $raw=~s/^\/\/.*//mg;
303  $raw=~s/\n//g;
304  $raw =~ s/\0+\Z//; # blasted nulls!
305  my $effective_strand = $self->end >= $self->start ? '+1' : '-1';
306  _complement(\$raw) if $self->r_strand ne $effective_strand;
307  return $self->{dna} = $raw;
308}
309
310# return a gff file
311sub gff {
312  my $self = shift;
313  my ($abs,$features) = rearrange([['ABS','ABSOLUTE'],'FEATURES'],@_);
314  $abs = $self->absolute unless defined $abs;
315
316  # can provide list of feature names, such as 'similarity', or 'all' to get 'em all
317  #  !THIS IS BROKEN; IT SHOULD LOOK LIKE FEATURE()!
318  my $opt = $self->_feature_filter($features);
319
320  my $gff = $self->_gff($opt);
321  warn $gff if $self->debug;
322
323  $self->transformGFF(\$gff) unless $abs;
324  return $gff;
325}
326
327# return a GFF object using the optional GFF.pm module
328sub GFF {
329  my $self = shift;
330  my ($filter,$converter) = @_;  # anonymous subs
331  croak "GFF module not installed" unless require GFF;
332  require GFF::Filehandle;
333
334  my @lines = grep !/^\/\//,split "\n",$self->gff(@_);
335  local *IN;
336  local ($^W) = 0;  # prevent complaint by GFF module
337  tie *IN,'GFF::Filehandle',\@lines;
338  my $gff = GFF::GeneFeatureSet->new;
339  $gff->read(\*IN,$filter,$converter) if $gff;
340  return $gff;
341}
342
343# Get the features table.  Can filter by type/subtype this way:
344# features('similarity:EST','annotation:assembly_tag')
345sub features {
346  my $self = shift;
347  my ($filter,$opt) = $self->_make_filter(@_);
348
349  # get raw gff file
350  my $gff = $self->gff(-features=>$opt);
351
352  # turn it into a list of features
353  my @features = $self->_make_features($gff,$filter);
354
355  if ($self->automerge) {  # automatic merging
356    # fetch out constructed transcripts and clones
357    my %types = map {lc($_)=>1} (@$opt,@_);
358    if ($types{'transcript'}) {
359      push @features,$self->_make_transcripts(\@features);
360      @features = grep {$_->type !~ /^(intron|exon)$/ } @features;
361    }
362    push @features,$self->_make_clones(\@features)      if $types{'clone'};
363    if ($types{'similarity'}) {
364      my @f = $self->_make_alignments(\@features);
365      @features = grep {$_->type ne 'similarity'} @features;
366      push @features,@f;
367    }
368  }
369
370  return wantarray ? @features : \@features;
371}
372
373# A little bit more complex - assemble a list of "transcripts"
374# consisting of Ace::Sequence::Transcript objects.  These objects
375# contain a list of exons and introns.
376sub transcripts {
377  my $self    = shift;
378  my $curated = shift;
379  my $ef       = $curated ? "exon:curated"   : "exon";
380  my $if       = $curated ? "intron:curated" : "intron";
381  my $sf       = $curated ? "Sequence:curated" : "Sequence";
382  my @features = $self->features($ef,$if,$sf);
383  return unless @features;
384  return $self->_make_transcripts(\@features);
385}
386
387sub _make_transcripts {
388  my $self = shift;
389  my $features = shift;
390
391  require Ace::Sequence::Transcript;
392  my %transcripts;
393
394  for my $feature (@$features) {
395    my $transcript = $feature->info;
396    next unless $transcript;
397    if ($feature->type =~ /^(exon|intron|cds)$/) {
398      my $type = $1;
399      push @{$transcripts{$transcript}{$type}},$feature;
400    } elsif ($feature->type eq 'Sequence') {
401      $transcripts{$transcript}{base} ||= $feature;
402    }
403  }
404
405  # get rid of transcripts without exons
406  foreach (keys %transcripts) {
407    delete $transcripts{$_} unless exists $transcripts{$_}{exon}
408  }
409
410  # map the rest onto Ace::Sequence::Transcript objects
411  return map {Ace::Sequence::Transcript->new($transcripts{$_})} keys %transcripts;
412}
413
414# Reassemble clones from clone left and right ends
415sub clones {
416  my $self = shift;
417  my @clones = $self->features('Clone_left_end','Clone_right_end','Sequence');
418  my %clones;
419  return unless @clones;
420  return $self->_make_clones(\@clones);
421}
422
423sub _make_clones {
424  my $self = shift;
425  my $features = shift;
426
427  my (%clones,@canonical_clones);
428  my $start_label = $self->strand < 0 ? 'end' : 'start';
429  my $end_label   = $self->strand < 0 ? 'start' : 'end';
430  for my $feature (@$features) {
431    $clones{$feature->info}{$start_label} = $feature->start if $feature->type eq 'Clone_left_end';
432    $clones{$feature->info}{$end_label}   = $feature->start if $feature->type eq 'Clone_right_end';
433
434    if ($feature->type eq 'Sequence') {
435      my $info = $feature->info;
436      next if $info =~ /LINK|CHROMOSOME|\.\w+$/;
437      if ($info->Genomic_canonical(0)) {
438	push (@canonical_clones,$info->Clone) if $info->Clone;
439      }
440    }
441  }
442
443  foreach (@canonical_clones) {
444    $clones{$_} ||= {};
445  }
446
447  my @features;
448  my ($r,$r_offset,$r_strand) = $self->refseq;
449  my $parent = $self->parent;
450  my $abs = $self->absolute;
451  if ($abs) {
452    $r_offset  = 0;
453    $r = $parent;
454    $r_strand = '+1';
455  }
456
457  # BAD HACK ALERT.  WE DON'T KNOW WHERE THE LEFT END OF THE CLONE IS SO WE USE
458  # THE MAGIC NUMBER -99_999_999 to mean "off left end" and
459  # +99_999_999 to mean "off right end"
460  for my $clone (keys %clones) {
461    my $start = $clones{$clone}{start} || -99_999_999;
462    my $end   = $clones{$clone}{end}   || +99_999_999;
463    my $phony_gff = join "\t",($parent,'Clone','structural',$start,$end,'.','.','.',qq(Clone "$clone"));
464    push @features,Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$phony_gff);
465  }
466  return @features;
467}
468
469# Assemble a list of "GappedAlignment" objects. These objects
470# contain a list of aligned segments.
471sub alignments {
472  my $self    = shift;
473  my @subtypes = @_;
474  my @types = map { "similarity:\^$_\$" } @subtypes;
475  push @types,'similarity' unless @types;
476  return $self->features(@types);
477}
478
479sub segments {
480  my $self = shift;
481  return;
482}
483
484sub _make_alignments {
485  my $self = shift;
486  my $features = shift;
487  require Ace::Sequence::GappedAlignment;
488
489  my %homol;
490
491  for my $feature (@$features) {
492    next unless $feature->type eq 'similarity';
493    my $target = $feature->info;
494    my $subtype = $feature->subtype;
495    push @{$homol{$target,$subtype}},$feature;
496  }
497
498  # map onto Ace::Sequence::GappedAlignment objects
499  return map {Ace::Sequence::GappedAlignment->new($homol{$_})} keys %homol;
500}
501
502# return list of features quickly
503sub feature_list {
504  my $self = shift;
505  return $self->{'feature_list'} if $self->{'feature_list'};
506  return unless my $raw = $self->_query('seqfeatures -version 2 -list');
507  return $self->{'feature_list'} = Ace::Sequence::FeatureList->new($raw);
508}
509
510# transform a GFF file into the coordinate system of the sequence
511sub transformGFF {
512  my $self = shift;
513  my $gff = shift;
514  my $parent  = $self->parent;
515  my $strand  = $self->{strand};
516  my $source  = $self->source;
517  my ($ref_source,$ref_offset,$ref_strand)  = $self->refseq;
518  $ref_source ||= $source;
519  $ref_strand ||= $strand;
520
521  if ($ref_strand > 0) {
522    my $o = defined($ref_offset) ? $ref_offset : ($self->p_offset + $self->offset);
523    # find anything that looks like a numeric field and subtract offset from it
524    $$gff =~ s/(?<!\")\s+(-?\d+)\s+(-?\d+)/"\t" . ($1 - $o) . "\t" . ($2 - $o)/eg;
525    $$gff =~ s/^$parent/$source/mg;
526    $$gff =~ s/\#\#sequence-region\s+\S+/##sequence-region $ref_source/m;
527    $$gff =~ s/FMAP_FEATURES\s+"\S+"/FMAP_FEATURES "$ref_source"/m;
528    return;
529  } else {  # strand eq '-'
530    my $o = defined($ref_offset) ? (2 + $ref_offset) : (2 + $self->p_offset - $self->offset);
531    $$gff =~ s/(?<!\")\s+(-?\d+)\s+(-?\d+)\s+([.\d]+)\s+(\S)/join "\t",'',$o-$2,$o-$1,$3,$plusminus{$4}/eg;
532    $$gff =~ s/(Target \"[^\"]+\" )(-?\d+) (-?\d+)/$1 $3 $2/g;
533    $$gff =~ s/^$parent/$source/mg;
534    $$gff =~ s/\#\#sequence-region\s+\S+\s+(-?\d+)\s+(-?\d+)/"##sequence-region $ref_source " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em;
535    $$gff =~ s/FMAP_FEATURES\s+"\S+"\s+(-?\d+)\s+(-?\d+)/"FMAP_FEATURES \"$ref_source\" " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em;
536  }
537
538}
539
540# return a name for the object
541sub name {
542  return shift->source_seq->name;
543}
544
545# for compatibility with Ace::Sequence::Feature
546sub info {
547  return shift->source_seq;
548}
549
550###################### internal functions #################
551# not necessarily object-oriented!!
552
553# return parent, parent offset and strand
554sub find_parent {
555  my $obj = shift;
556
557  # first, if we are passed an Ace::Sequence, then we can inherit
558  # these settings directly
559  return (@{$obj}{qw(parent p_offset length)},$obj->r_strand)
560    if $obj->isa('Ace::Sequence');
561
562  # otherwise, if we are passed an Ace::Object, then we must
563  # traverse upwards until we find a suitable parent
564  return _traverse($obj) if $obj->isa('Ace::Object');
565
566  # otherwise, we don't know what to do...
567  croak "Source sequence not an Ace::Object or an Ace::Sequence";
568}
569
570sub _get_parent {
571  my $obj = shift;
572  # ** DANGER DANGER WILL ROBINSON! **
573  # This is an experiment in caching parents to speed lookups.  Probably eats memory voraciously.
574  return $CACHE{$obj} if CACHE && exists $CACHE{$obj};
575  my $p = $obj->get(S_Parent=>2)|| $obj->get(Source=>1);
576  return unless $p;
577  return CACHE ? $CACHE{$obj} = $p->fetch
578               : $p->fetch;
579}
580
581sub _get_children {
582  my $obj = shift;
583  my @pieces = $obj->get(S_Child=>2);
584  return @pieces if @pieces;
585  return @pieces = $obj->get('Subsequence');
586}
587
588# get sequence, offset and strand of topmost container
589sub _traverse {
590  my $obj = shift;
591  my ($offset,$length);
592
593  # invoke seqget to find the top-level container for this sequence
594  my ($tl,$tl_start,$tl_end) = _get_toplevel($obj);
595  $tl_start ||= 0;
596  $tl_end ||= 0;
597
598  # make it an object
599  $tl = ref($obj)->new(-name=>$tl,-class=>'Sequence',-db=>$obj->db);
600
601  $offset += $tl_start - 1;  # offset to beginning of toplevel
602  $length ||= abs($tl_end - $tl_start) + 1;
603  my $strand = $tl_start < $tl_end ? +1 : -1;
604
605  return ($tl,$offset,$strand < 0 ? ($length,'-1') : ($length,'+1') ) if $length;
606}
607
608sub _get_toplevel {
609  my $obj = shift;
610  my $class = $obj->class;
611  my $name  = $obj->name;
612
613  my $smap = $obj->db->raw_query("gif smap -from $class:$name");
614  my ($parent,$pstart,$pstop,$tstart,$tstop,$map_type) =
615    $smap =~ /^SMAP\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)/;
616
617  $parent ||= '';
618  $parent =~ s/^Sequence://;  # remove this in next version of Acedb
619  return ($parent,$pstart,$pstop);
620}
621
622# create subroutine that filters GFF files for certain feature types
623sub _make_filter {
624  my $self = shift;
625  my $automerge = $self->automerge;
626
627  # parse out the filter
628  my %filter;
629  foreach (@_) {
630    my ($type,$filter) = split(':',$_,2);
631    if ($automerge && lc($type) eq 'transcript') {
632      @filter{'exon','intron','Sequence','cds'} = ([undef],[undef],[undef],[undef]);
633    } elsif ($automerge && lc($type) eq 'clone') {
634      @filter{'Clone_left_end','Clone_right_end','Sequence'} = ([undef],[undef],[undef]);
635    } else {
636      push @{$filter{$type}},$filter;
637    }
638  }
639
640  # create pattern-match sub
641  my $sub;
642  my $promiscuous;  # indicates that there is a subtype without a type
643
644  if (%filter) {
645    my $s = "sub { my \@d = split(\"\\t\",\$_[0]);\n";
646    for my $type (keys %filter) {
647      my $expr;
648      my $subtypes = $filter{$type};
649      if ($type ne '') {
650	for my $st (@$subtypes) {
651	  $expr .= defined $st ? "return 1 if \$d[2]=~/$type/i && \$d[1]=~/$st/i;\n"
652	                       : "return 1 if \$d[2]=~/$type/i;\n"
653	}
654      } else {  # no type, only subtypes
655	$promiscuous++;
656	for my $st (@$subtypes) {
657	  next unless defined $st;
658	  $expr .= "return 1 if \$d[1]=~/$st/i;\n";
659	}
660      }
661      $s .= $expr;
662    }
663    $s .= "return;\n }";
664
665    $sub = eval $s;
666    croak $@ if $@;
667  } else {
668    $sub = sub { 1; }
669  }
670  return ($sub,$promiscuous ? [] : [keys %filter]);
671}
672
673# turn a GFF file and a filter into a list of Ace::Sequence::Feature objects
674sub _make_features {
675  my $self = shift;
676  my ($gff,$filter) = @_;
677
678  my ($r,$r_offset,$r_strand) = $self->refseq;
679  my $parent = $self->parent;
680  my $abs    = $self->absolute;
681  if ($abs) {
682    $r_offset  = 0;
683    $r = $parent;
684    $r_strand = '+1';
685  }
686  my @features = map {Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$_)}
687                 grep !m@^(?:\#|//)@ && $filter->($_),split("\n",$gff);
688}
689
690
691# low level GFF call, no changing absolute to relative coordinates
692sub _gff {
693  my $self = shift;
694  my ($opt,$db) = @_;
695  my $data = $self->_query("seqfeatures -version 2 $opt",$db);
696  $data =~ s/\0+\Z//;
697  return $data; #blasted nulls!
698}
699
700# shortcut for running a gif query
701sub _query {
702  my $self = shift;
703  my $command = shift;
704  my $db      = shift || $self->db;
705
706  my $parent = $self->parent;
707  my $start = $self->start(1);
708  my $end   = $self->end(1);
709  ($start,$end) = ($end,$start) if $start > $end;  #flippity floppity
710
711  my $coord   = "-coords $start $end";
712
713  # BAD BAD HACK ALERT - CHECKS THE QUERY THAT IS PASSED DOWN
714  # ALSO MAKES THINGS INCOMPATIBLE WITH PRIOR 4.9 servers.
715#  my $opt     = $command =~ /seqfeatures/ ? '-nodna' : '';
716  my $opt = '-noclip';
717
718  my $query = "gif seqget $parent $opt $coord ; $command";
719  warn $query if $self->debug;
720
721  return $db->raw_query("gif seqget $parent $opt $coord ; $command");
722}
723
724# utility function -- reverse complement
725sub _complement {
726  my $dna = shift;
727  $$dna =~ tr/GATCgatc/CTAGctag/;
728  $$dna = scalar reverse $$dna;
729}
730
731sub _feature_filter {
732  my $self = shift;
733  my $features = shift;
734  return '' unless $features;
735  my $opt = '';
736  $opt = '+feature ' . join('|',@$features) if ref($features) eq 'ARRAY' && @$features;
737  $opt = "+feature $features" unless ref $features;
738  $opt;
739}
740
7411;
742
743=head1 NAME
744
745Ace::Sequence - Examine ACeDB Sequence Objects
746
747=head1 SYNOPSIS
748
749    # open database connection and get an Ace::Object sequence
750    use Ace::Sequence;
751
752    $db  = Ace->connect(-host => 'stein.cshl.org',-port => 200005);
753    $obj = $db->fetch(Predicted_gene => 'ZK154.3');
754
755    # Wrap it in an Ace::Sequence object
756    $seq = Ace::Sequence->new($obj);
757
758    # Find all the exons
759    @exons = $seq->features('exon');
760
761    # Find all the exons predicted by various versions of "genefinder"
762    @exons = $seq->features('exon:genefinder.*');
763
764    # Iterate through the exons, printing their start, end and DNA
765    for my $exon (@exons) {
766      print join "\t",$exon->start,$exon->end,$exon->dna,"\n";
767    }
768
769    # Find the region 1000 kb upstream of the first exon
770    $sub = Ace::Sequence->new(-seq=>$exons[0],
771                              -offset=>-1000,-length=>1000);
772
773    # Find all features in that area
774    @features = $sub->features;
775
776    # Print its DNA
777    print $sub->dna;
778
779    # Create a new Sequence object from the first 500 kb of chromosome 1
780    $seq = Ace::Sequence->new(-name=>'CHROMOSOME_I',-db=>$db,
781			      -offset=>0,-length=>500_000);
782
783    # Get the GFF dump as a text string
784    $gff = $seq->gff;
785
786    # Limit dump to Predicted_genes
787    $gff_genes = $seq->gff(-features=>'Predicted_gene');
788
789    # Return a GFF object (using optional GFF.pm module from Sanger)
790    $gff_obj = $seq->GFF;
791
792=head1 DESCRIPTION
793
794I<Ace::Sequence>, and its allied classes L<Ace::Sequence::Feature> and
795L<Ace::Sequence::FeatureList>, provide a convenient interface to the
796ACeDB Sequence classes and the GFF sequence feature file format.
797
798Using this class, you can define a region of the genome by using a
799landmark (sequenced clone, link, superlink, predicted gene), an offset
800from that landmark, and a distance.  Offsets and distances can be
801positive or negative.  This will return an I<Ace::Sequence> object.
802Once a region is defined, you may retrieve its DNA sequence, or query
803the database for any features that may be contained within this
804region.  Features can be returned as objects (using the
805I<Ace::Sequence::Feature> class), as GFF text-only dumps, or in the
806form of the GFF class defined by the Sanger Centre's GFF.pm module.
807
808This class builds on top of L<Ace> and L<Ace::Object>.  Please see
809their manual pages before consulting this one.
810
811=head1 Creating New Ace::Sequence Objects, the new() Method
812
813 $seq = Ace::Sequence->new($object);
814
815 $seq = Ace::Sequence->new(-source  => $object,
816                           -offset  => $offset,
817                           -length  => $length,
818			   -refseq  => $reference_sequence);
819
820 $seq = Ace::Sequence->new(-name    => $name,
821			   -db      => $db,
822                           -offset  => $offset,
823                           -length  => $length,
824			   -refseq  => $reference_sequence);
825
826In order to create an I<Ace::Sequence> you will need an active I<Ace>
827database accessor.  Sequence regions are defined using a "source"
828sequence, an offset, and a length.  Optionally, you may also provide a
829"reference sequence" to establish the coordinate system for all
830inquiries.  Sequences may be generated from existing I<Ace::Object>
831sequence objects, from other I<Ace::Sequence> and
832I<Ace::Sequence::Feature> objects, or from a sequence name and a
833database handle.
834
835The class method named new() is the interface to these facilities.  In
836its simplest, one-argument form, you provide new() with a
837previously-created I<Ace::Object> that points to Sequence or
838sequence-like object (the meaning of "sequence-like" is explained in
839more detail below.)  The new() method will return an I<Ace::Sequence>
840object extending from the beginning of the object through to its
841natural end.
842
843In the named-parameter form of new(), the following arguments are
844recognized:
845
846=over 4
847
848=item -source
849
850The sequence source.  This must be an I<Ace::Object> of the "Sequence"
851class, or be a sequence-like object containing the SMap tag (see
852below).
853
854=item -offset
855
856An offset from the beginning of the source sequence.  The retrieved
857I<Ace::Sequence> will begin at this position.  The offset can be any
858positive or negative integer.  Offets are B<0-based>.
859
860=item -length
861
862The length of the sequence to return.  Either a positive or negative
863integer can be specified.  If a negative length is given, the returned
864sequence will be complemented relative to the source sequence.
865
866=item -refseq
867
868The sequence to use to establish the coordinate system for the
869returned sequence.  Normally the source sequence is used to establish
870the coordinate system, but this can be used to override that choice.
871You can provide either an I<Ace::Object> or just a sequence name for
872this argument.  The source and reference sequences must share a common
873ancestor, but do not have to be directly related.  An attempt to use a
874disjunct reference sequence, such as one on a different chromosome,
875will fail.
876
877=item -name
878
879As an alternative to using an I<Ace::Object> with the B<-source>
880argument, you may specify a source sequence using B<-name> and B<-db>.
881The I<Ace::Sequence> module will use the provided database accessor to
882fetch a Sequence object with the specified name. new() will return
883undef is no Sequence by this name is known.
884
885=item -db
886
887This argument is required if the source sequence is specified by name
888rather than by object reference.
889
890=back
891
892If new() is successful, it will create an I<Ace::Sequence> object and
893return it.  Otherwise it will return undef and return a descriptive
894message in Ace->error().  Certain programming errors, such as a
895failure to provide required arguments, cause a fatal error.
896
897=head2 Reference Sequences and the Coordinate System
898
899When retrieving information from an I<Ace::Sequence>, the coordinate
900system is based on the sequence segment selected at object creation
901time.  That is, the "+1" strand is the natural direction of the
902I<Ace::Sequence> object, and base pair 1 is its first base pair.  This
903behavior can be overridden by providing a reference sequence to the
904new() method, in which case the orientation and position of the
905reference sequence establishes the coordinate system for the object.
906
907In addition to the reference sequence, there are two other sequences
908used by I<Ace::Sequence> for internal bookeeping.  The "source"
909sequence corresponds to the smallest ACeDB sequence object that
910completely encloses the selected sequence segment.  The "parent"
911sequence is the smallest ACeDB sequence object that contains the
912"source".  The parent is used to derive the length and orientation of
913source sequences that are not directly associated with DNA objects.
914
915In many cases, the source sequence will be identical to the sequence
916initially passed to the new() method.  However, there are exceptions
917to this rule.  One common exception occurs when the offset and/or
918length cross the boundaries of the passed-in sequence.  In this case,
919the ACeDB database is searched for the smallest sequence that contains
920both endpoints of the I<Ace::Sequence> object.
921
922The other common exception occurs in Ace 4.8, where there is support
923for "sequence-like" objects that contain the C<SMap> ("Sequence Map")
924tag.  The C<SMap> tag provides genomic location information for
925arbitrary object -- not just those descended from the Sequence class.
926This allows ACeDB to perform genome map operations on objects that are
927not directly related to sequences, such as genetic loci that have been
928interpolated onto the physical map.  When an C<SMap>-containing object
929is passed to the I<Ace::Sequence> new() method, the module will again
930choose the smallest ACeDB Sequence object that contains both
931end-points of the desired region.
932
933If an I<Ace::Sequence> object is used to create a new I<Ace::Sequence>
934object, then the original object's source is inherited.
935
936=head1 Object Methods
937
938Once an I<Ace::Sequence> object is created, you can query it using the
939following methods:
940
941=head2 asString()
942
943  $name = $seq->asString;
944
945Returns a human-readable identifier for the sequence in the form
946I<Source/start-end>, where "Source" is the name of the source
947sequence, and "start" and "end" are the endpoints of the sequence
948relative to the source (using 1-based indexing).  This method is
949called automatically when the I<Ace::Sequence> is used in a string
950context.
951
952=head2 source_seq()
953
954  $source = $seq->source_seq;
955
956Return the source of the I<Ace::Sequence>.
957
958=head2 parent_seq()
959
960  $parent = $seq->parent_seq;
961
962Return the immediate ancestor of the sequence.  The parent of the
963top-most sequence (such as the CHROMOSOME link) is itself.  This
964method is used internally to ascertain the length of source sequences
965which are not associated with a DNA object.
966
967NOTE: this procedure is a trifle funky and cannot reliably be used to
968traverse upwards to the top-most sequence.  The reason for this is
969that it will return an I<Ace::Sequence> in some cases, and an
970I<Ace::Object> in others.  Use get_parent() to traverse upwards
971through a uniform series of I<Ace::Sequence> objects upwards.
972
973=head2 refseq([$seq])
974
975  $refseq = $seq->refseq;
976
977Returns the reference sequence, if one is defined.
978
979  $seq->refseq($new_ref);
980
981Set the reference sequence. The reference sequence must share the same
982ancestor with $seq.
983
984=head2 start()
985
986  $start = $seq->start;
987
988Start of this sequence, relative to the source sequence, using 1-based
989indexing.
990
991=head2 end()
992
993  $end = $seq->end;
994
995End of this sequence, relative to the source sequence, using 1-based
996indexing.
997
998=head2 offset()
999
1000  $offset = $seq->offset;
1001
1002Offset of the beginning of this sequence relative to the source
1003sequence, using 0-based indexing.  The offset may be negative if the
1004beginning of the sequence is to the left of the beginning of the
1005source sequence.
1006
1007=head2 length()
1008
1009  $length = $seq->length;
1010
1011The length of this sequence, in base pairs.  The length may be
1012negative if the sequence's orientation is reversed relative to the
1013source sequence.  Use abslength() to obtain the absolute value of
1014the sequence length.
1015
1016=head2 abslength()
1017
1018  $length = $seq->abslength;
1019
1020Return the absolute value of the length of the sequence.
1021
1022=head2 strand()
1023
1024  $strand = $seq->strand;
1025
1026Returns +1 for a sequence oriented in the natural direction of the
1027genomic reference sequence, or -1 otherwise.
1028
1029=head2 reversed()
1030
1031Returns true if the segment is reversed relative to the canonical
1032genomic direction.  This is the same as $seq->strand < 0.
1033
1034=head2 dna()
1035
1036  $dna = $seq->dna;
1037
1038Return the DNA corresponding to this sequence.  If the sequence length
1039is negative, the reverse complement of the appropriate segment will be
1040returned.
1041
1042ACeDB allows Sequences to exist without an associated DNA object
1043(which typically happens during intermediate stages of a sequencing
1044project.  In such a case, the returned sequence will contain the
1045correct number of "-" characters.
1046
1047=head2 name()
1048
1049  $name = $seq->name;
1050
1051Return the name of the source sequence as a string.
1052
1053=head2 get_parent()
1054
1055  $parent = $seq->parent;
1056
1057Return the immediate ancestor of this I<Ace::Sequence> (i.e., the
1058sequence that contains this one).  The return value is a new
1059I<Ace::Sequence> or undef, if no parent sequence exists.
1060
1061=head2 get_children()
1062
1063  @children = $seq->get_children();
1064
1065Returns all subsequences that exist as independent objects in the
1066ACeDB database.  What exactly is returned is dependent on the data
1067model.  In older ACeDB databases, the only subsequences are those
1068under the catchall Subsequence tag.  In newer ACeDB databases, the
1069objects returned correspond to objects to the right of the S_Child
1070subtag using a tag[2] syntax, and may include Predicted_genes,
1071Sequences, Links, or other objects.  The return value is a list of
1072I<Ace::Sequence> objects.
1073
1074=head2 features()
1075
1076  @features = $seq->features;
1077  @features = $seq->features('exon','intron','Predicted_gene');
1078  @features = $seq->features('exon:GeneFinder','Predicted_gene:hand.*');
1079
1080features() returns an array of I<Sequence::Feature> objects.  If
1081called without arguments, features() returns all features that cross
1082the sequence region.  You may also provide a filter list to select a
1083set of features by type and subtype.  The format of the filter list
1084is:
1085
1086  type:subtype
1087
1088Where I<type> is the class of the feature (the "feature" field of the
1089GFF format), and I<subtype> is a description of how the feature was
1090derived (the "source" field of the GFF format).  Either of these
1091fields can be absent, and either can be a regular expression.  More
1092advanced filtering is not supported, but is provided by the Sanger
1093Centre's GFF module.
1094
1095The order of the features in the returned list is not specified.  To
1096obtain features sorted by position, use this idiom:
1097
1098  @features = sort { $a->start <=> $b->start } $seq->features;
1099
1100=head2 feature_list()
1101
1102  my $list = $seq->feature_list();
1103
1104This method returns a summary list of the features that cross the
1105sequence in the form of a L<Ace::Feature::List> object.  From the
1106L<Ace::Feature::List> object you can obtain the list of feature names
1107and the number of each type.  The feature list is obtained from the
1108ACeDB server with a single short transaction, and therefore has much
1109less overhead than features().
1110
1111See L<Ace::Feature::List> for more details.
1112
1113=head2 transcripts()
1114
1115This returns a list of Ace::Sequence::Transcript objects, which are
1116specializations of Ace::Sequence::Feature.  See L<Ace::Sequence::Transcript>
1117for details.
1118
1119=head2 clones()
1120
1121This returns a list of Ace::Sequence::Feature objects containing
1122reconstructed clones.  This is a nasty hack, because ACEDB currently
1123records clone ends, but not the clones themselves, meaning that we
1124will not always know both ends of the clone.  In this case the missing
1125end has a synthetic position of -99,999,999 or +99,999,999.  Sorry.
1126
1127=head2 gff()
1128
1129  $gff = $seq->gff();
1130  $gff = $seq->gff(-abs      => 1,
1131                   -features => ['exon','intron:GeneFinder']);
1132
1133This method returns a GFF file as a scalar.  The following arguments
1134are optional:
1135
1136=over 4
1137
1138=item -abs
1139
1140Ordinarily the feature entries in the GFF file will be returned in
1141coordinates relative to the start of the I<Ace::Sequence> object.
1142Position 1 will be the start of the sequence object, and the "+"
1143strand will be the sequence object's natural orientation.  However if
1144a true value is provided to B<-abs>, the coordinate system used will
1145be relative to the start of the source sequence, i.e. the native ACeDB
1146Sequence object (usually a cosmid sequence or a link).
1147
1148If a reference sequence was provided when the I<Ace::Sequence> was
1149created, it will be used by default to set the coordinate system.
1150Relative coordinates can be reenabled by providing a false value to
1151B<-abs>.
1152
1153Ordinarily the coordinate system manipulations automatically "do what
1154you want" and you will not need to adjust them.  See also the abs()
1155method described below.
1156
1157=item -features
1158
1159The B<-features> argument filters the features according to a list of
1160types and subtypes.  The format is identical to the one described for
1161the features() method.  A single filter may be provided as a scalar
1162string.  Multiple filters may be passed as an array reference.
1163
1164=back
1165
1166See also the GFF() method described next.
1167
1168=head2 GFF()
1169
1170  $gff_object = $seq->gff;
1171  $gff_object = $seq->gff(-abs      => 1,
1172                   -features => ['exon','intron:GeneFinder']);
1173
1174The GFF() method takes the same arguments as gff() described above,
1175but it returns a I<GFF::GeneFeatureSet> object from the GFF.pm
1176module.  If the GFF module is not installed, this method will generate
1177a fatal error.
1178
1179=head2 absolute()
1180
1181 $abs = $seq->absolute;
1182 $abs = $seq->absolute(1);
1183
1184This method controls whether the coordinates of features are returned
1185in absolute or relative coordinates.  "Absolute" coordinates are
1186relative to the underlying source or reference sequence.  "Relative"
1187coordinates are relative to the I<Ace::Sequence> object.  By default,
1188coordinates are relative unless new() was provided with a reference
1189sequence.  This default can be examined and changed using absolute().
1190
1191=head2 automerge()
1192
1193  $merge = $seq->automerge;
1194  $seq->automerge(0);
1195
1196This method controls whether groups of features will automatically be
1197merged together by the features() call.  If true (the default), then
1198the left and right end of clones will be merged into "clone" features,
1199introns, exons and CDS entries will be merged into
1200Ace::Sequence::Transcript objects, and similarity entries will be
1201merged into Ace::Sequence::GappedAlignment objects.
1202
1203=head2 db()
1204
1205  $db = $seq->db;
1206
1207Returns the L<Ace> database accessor associated with this sequence.
1208
1209=head1 SEE ALSO
1210
1211L<Ace>, L<Ace::Object>, L<Ace::Sequence::Feature>,
1212L<Ace::Sequence::FeatureList>, L<GFF>
1213
1214=head1 AUTHOR
1215
1216Lincoln Stein <lstein@cshl.org> with extensive help from Jean
1217Thierry-Mieg <mieg@kaa.crbm.cnrs-mop.fr>
1218
1219Many thanks to David Block <dblock@gene.pbi.nrc.ca> for finding and
1220fixing the nasty off-by-one errors.
1221
1222Copyright (c) 1999, Lincoln D. Stein
1223
1224This library is free software; you can redistribute it and/or modify
1225it under the same terms as Perl itself.  See DISCLAIMER.txt for
1226disclaimers of warranty.
1227
1228=cut
1229
1230__END__
1231