1package Ace::Sequence; 2use strict; 3 4use Carp; 5use strict; 6use Ace 1.50 qw(:DEFAULT rearrange); 7use Ace::Sequence::FeatureList; 8use Ace::Sequence::Feature; 9use AutoLoader 'AUTOLOAD'; 10use vars '$VERSION'; 11my %CACHE; 12 13$VERSION = '1.51'; 14 15use constant CACHE => 1; 16 17use overload 18 '""' => 'asString', 19 cmp => 'cmp', 20; 21 22# synonym: stop = end 23*stop = \&end; 24*abs = \&absolute; 25*source_seq = \&source; 26*source_tag = \&subtype; 27*primary_tag = \&type; 28 29my %plusminus = ( '+' => '-', 30 '-' => '+', 31 '.' => '.'); 32 33# internal keys 34# parent => reference Sequence in "+" strand 35# p_offset => our start in the parent 36# length => our length 37# strand => our strand (+ or -) 38# refseq => reference Sequence for coordinate system 39 40# object constructor 41# usually called like this: 42# $seq = Ace::Sequence->new($object); 43# but can be called like this: 44# $seq = Ace::Sequence->new(-db=>$db,-name=>$name); 45# or 46# $seq = Ace::Sequence->new(-seq => $object, 47# -offset => $offset, 48# -length => $length, 49# -ref => $refseq 50# ); 51# $refseq, if provided, will be used to establish the coordinate 52# system. Otherwise the first base pair will be set to 1. 53sub new { 54 my $pack = shift; 55 my ($seq,$start,$end,$offset,$length,$refseq,$db) = 56 rearrange([ 57 ['SEQ','SEQUENCE','SOURCE'], 58 'START', 59 ['END','STOP'], 60 ['OFFSET','OFF'], 61 ['LENGTH','LEN'], 62 'REFSEQ', 63 ['DATABASE','DB'], 64 ],@_); 65 66 # Object must have a parent sequence and/or a reference 67 # sequence. In some cases, the parent sequence will be the 68 # object itself. The reference sequence is used to set up 69 # the frame of reference for the coordinate system. 70 71 # fetch the sequence object if we don't have it already 72 croak "Please provide either a Sequence object or a database and name" 73 unless ref($seq) || ($seq && $db); 74 75 # convert start into offset 76 $offset = $start - 1 if defined($start) and !defined($offset); 77 78 # convert stop/end into length 79 $length = ($end > $start) ? $end - $offset : $end - $offset - 2 80 if defined($end) && !defined($length); 81 82 # if just a string is passed, try to fetch a Sequence object 83 my $obj = ref($seq) ? $seq : $db->fetch('Sequence'=>$seq); 84 unless ($obj) { 85 Ace->error("No Sequence named $obj found in database"); 86 return; 87 } 88 89 # get parent coordinates and length of this sequence 90 # the parent is an Ace Sequence object in the "+" strand 91 my ($parent,$p_offset,$p_length,$strand) = find_parent($obj); 92 return unless $parent; 93 94 # handle negative strands 95 my $r_strand = $strand; 96 my $r_offset = $p_offset; 97 $offset ||= 0; 98 $offset *= -1 if $strand < 0; 99 100 # handle feature objects 101 $offset += $obj->offset if $obj->can('smapped'); 102 103 # get source 104 my $source = $obj->can('smapped') ? $obj->source : $obj; 105 106 # store the object into our instance variables 107 my $self = bless { 108 obj => $source, 109 offset => $offset, 110 length => $length || $p_length, 111 parent => $parent, 112 p_offset => $p_offset, 113 refseq => [$source,$r_offset,$r_strand], 114 strand => $strand, 115 absolute => 0, 116 automerge => 1, 117 },$pack; 118 119 # set the reference sequence 120 eval { $self->refseq($refseq) } or return if defined $refseq; 121 122 # wheww! 123 return $self; 124} 125 126# return the "source" object that the user offset from 127sub source { 128 $_[0]->{obj}; 129} 130 131# return the parent object 132sub parent { $_[0]->{parent} } 133 134# return the length 135#sub length { $_[0]->{length} } 136sub length { 137 my $self = shift; 138 my ($start,$end) = ($self->start,$self->end); 139 return $end - $start + ($end > $start ? 1 : -1); # for stupid 1-based adjustments 140} 141 142sub reversed { return shift->strand < 0; } 143 144sub automerge { 145 my $self = shift; 146 my $d = $self->{automerge}; 147 $self->{automerge} = shift if @_; 148 $d; 149} 150 151# return reference sequence 152sub refseq { 153 my $self = shift; 154 my $prev = $self->{refseq}; 155 if (@_) { 156 my $refseq = shift; 157 my $arrayref; 158 159 BLOCK: { 160 last BLOCK unless defined ($refseq); 161 162 if (ref($refseq) && ref($refseq) eq 'ARRAY') { 163 $arrayref = $refseq; 164 last BLOCK; 165 } 166 167 if (ref($refseq) && ($refseq->can('smapped'))) { 168 croak "Reference sequence has no common ancestor with sequence" 169 unless $self->parent eq $refseq->parent; 170 my ($a,$b,$c) = @{$refseq->{refseq}}; 171 # $b += $refseq->offset; 172 $b += $refseq->offset; 173 $arrayref = [$refseq,$b,$refseq->strand]; 174 last BLOCK; 175 } 176 177 178 # look up reference sequence in database if we aren't given 179 # database object already 180 $refseq = $self->db->fetch('Sequence' => $refseq) 181 unless $refseq->isa('Ace::Object'); 182 croak "Invalid reference sequence" unless $refseq; 183 184 # find position of ref sequence in parent strand 185 my ($r_parent,$r_offset,$r_length,$r_strand) = find_parent($refseq); 186 croak "Reference sequence has no common ancestor with sequence" 187 unless $r_parent eq $self->{parent}; 188 189 # set to array reference containing this information 190 $arrayref = [$refseq,$r_offset,$r_strand]; 191 } 192 $self->{refseq} = $arrayref; 193 } 194 return unless $prev; 195 return $self->parent if $self->absolute; 196 return wantarray ? @{$prev} : $prev->[0]; 197} 198 199# return strand 200sub strand { return $_[0]->{strand} } 201 202# return reference strand 203sub r_strand { 204 my $self = shift; 205 return "+1" if $self->absolute; 206 if (my ($ref,$r_offset,$r_strand) = $self->refseq) { 207 return $r_strand; 208 } else { 209 return $self->{strand} 210 } 211} 212 213sub offset { $_[0]->{offset} } 214sub p_offset { $_[0]->{p_offset} } 215 216sub smapped { 1; } 217sub type { 'Sequence' } 218sub subtype { } 219 220sub debug { 221 my $self = shift; 222 my $d = $self->{_debug}; 223 $self->{_debug} = shift if @_; 224 $d; 225} 226 227# return the database this sequence is associated with 228sub db { 229 return Ace->name2db($_[0]->{db} ||= $_[0]->source->db); 230} 231 232sub start { 233 my ($self,$abs) = @_; 234 $abs = $self->absolute unless defined $abs; 235 return $self->{p_offset} + $self->{offset} + 1 if $abs; 236 237 if ($self->refseq) { 238 my ($ref,$r_offset,$r_strand) = $self->refseq; 239 return $r_strand < 0 ? 1 + $r_offset - ($self->{p_offset} + $self->{offset}) 240 : 1 + $self->{p_offset} + $self->{offset} - $r_offset; 241 } 242 243 else { 244 return $self->{offset} +1; 245 } 246 247} 248 249sub end { 250 my ($self,$abs) = @_; 251 my $start = $self->start($abs); 252 my $f = $self->{length} > 0 ? 1 : -1; # for stupid 1-based adjustments 253 if ($abs && $self->refseq ne $self->parent) { 254 my $r_strand = $self->r_strand; 255 return $start - $self->{length} + $f 256 if $r_strand < 0 or $self->{strand} < 0 or $self->{length} < 0; 257 return $start + $self->{length} - $f 258 } 259 return $start + $self->{length} - $f if $self->r_strand eq $self->{strand}; 260 return $start - $self->{length} + $f; 261} 262 263# turn on absolute coordinates (relative to reference sequence) 264sub absolute { 265 my $self = shift; 266 my $prev = $self->{absolute}; 267 $self->{absolute} = $_[0] if defined $_[0]; 268 return $prev; 269} 270 271# human readable string (for debugging) 272sub asString { 273 my $self = shift; 274 if ($self->absolute) { 275 return join '',$self->parent,'/',$self->start,',',$self->end; 276 } elsif (my $ref = $self->refseq){ 277 my $label = $ref->isa('Ace::Sequence::Feature') ? $ref->info : "$ref"; 278 return join '',$label,'/',$self->start,',',$self->end; 279 280 } else { 281 join '',$self->source,'/',$self->start,',',$self->end; 282 } 283} 284 285sub cmp { 286 my ($self,$arg,$reversed) = @_; 287 if (ref($arg) and $arg->isa('Ace::Sequence')) { 288 my $cmp = $self->parent cmp $arg->parent 289 || $self->start <=> $arg->start; 290 return $reversed ? -$cmp : $cmp; 291 } 292 my $name = $self->asString; 293 return $reversed ? $arg cmp $name : $name cmp $arg; 294} 295 296# Return the DNA 297sub dna { 298 my $self = shift; 299 return $self->{dna} if $self->{dna}; 300 my $raw = $self->_query('seqdna'); 301 $raw=~s/^>.*\n//m; 302 $raw=~s/^\/\/.*//mg; 303 $raw=~s/\n//g; 304 $raw =~ s/\0+\Z//; # blasted nulls! 305 my $effective_strand = $self->end >= $self->start ? '+1' : '-1'; 306 _complement(\$raw) if $self->r_strand ne $effective_strand; 307 return $self->{dna} = $raw; 308} 309 310# return a gff file 311sub gff { 312 my $self = shift; 313 my ($abs,$features) = rearrange([['ABS','ABSOLUTE'],'FEATURES'],@_); 314 $abs = $self->absolute unless defined $abs; 315 316 # can provide list of feature names, such as 'similarity', or 'all' to get 'em all 317 # !THIS IS BROKEN; IT SHOULD LOOK LIKE FEATURE()! 318 my $opt = $self->_feature_filter($features); 319 320 my $gff = $self->_gff($opt); 321 warn $gff if $self->debug; 322 323 $self->transformGFF(\$gff) unless $abs; 324 return $gff; 325} 326 327# return a GFF object using the optional GFF.pm module 328sub GFF { 329 my $self = shift; 330 my ($filter,$converter) = @_; # anonymous subs 331 croak "GFF module not installed" unless require GFF; 332 require GFF::Filehandle; 333 334 my @lines = grep !/^\/\//,split "\n",$self->gff(@_); 335 local *IN; 336 local ($^W) = 0; # prevent complaint by GFF module 337 tie *IN,'GFF::Filehandle',\@lines; 338 my $gff = GFF::GeneFeatureSet->new; 339 $gff->read(\*IN,$filter,$converter) if $gff; 340 return $gff; 341} 342 343# Get the features table. Can filter by type/subtype this way: 344# features('similarity:EST','annotation:assembly_tag') 345sub features { 346 my $self = shift; 347 my ($filter,$opt) = $self->_make_filter(@_); 348 349 # get raw gff file 350 my $gff = $self->gff(-features=>$opt); 351 352 # turn it into a list of features 353 my @features = $self->_make_features($gff,$filter); 354 355 if ($self->automerge) { # automatic merging 356 # fetch out constructed transcripts and clones 357 my %types = map {lc($_)=>1} (@$opt,@_); 358 if ($types{'transcript'}) { 359 push @features,$self->_make_transcripts(\@features); 360 @features = grep {$_->type !~ /^(intron|exon)$/ } @features; 361 } 362 push @features,$self->_make_clones(\@features) if $types{'clone'}; 363 if ($types{'similarity'}) { 364 my @f = $self->_make_alignments(\@features); 365 @features = grep {$_->type ne 'similarity'} @features; 366 push @features,@f; 367 } 368 } 369 370 return wantarray ? @features : \@features; 371} 372 373# A little bit more complex - assemble a list of "transcripts" 374# consisting of Ace::Sequence::Transcript objects. These objects 375# contain a list of exons and introns. 376sub transcripts { 377 my $self = shift; 378 my $curated = shift; 379 my $ef = $curated ? "exon:curated" : "exon"; 380 my $if = $curated ? "intron:curated" : "intron"; 381 my $sf = $curated ? "Sequence:curated" : "Sequence"; 382 my @features = $self->features($ef,$if,$sf); 383 return unless @features; 384 return $self->_make_transcripts(\@features); 385} 386 387sub _make_transcripts { 388 my $self = shift; 389 my $features = shift; 390 391 require Ace::Sequence::Transcript; 392 my %transcripts; 393 394 for my $feature (@$features) { 395 my $transcript = $feature->info; 396 next unless $transcript; 397 if ($feature->type =~ /^(exon|intron|cds)$/) { 398 my $type = $1; 399 push @{$transcripts{$transcript}{$type}},$feature; 400 } elsif ($feature->type eq 'Sequence') { 401 $transcripts{$transcript}{base} ||= $feature; 402 } 403 } 404 405 # get rid of transcripts without exons 406 foreach (keys %transcripts) { 407 delete $transcripts{$_} unless exists $transcripts{$_}{exon} 408 } 409 410 # map the rest onto Ace::Sequence::Transcript objects 411 return map {Ace::Sequence::Transcript->new($transcripts{$_})} keys %transcripts; 412} 413 414# Reassemble clones from clone left and right ends 415sub clones { 416 my $self = shift; 417 my @clones = $self->features('Clone_left_end','Clone_right_end','Sequence'); 418 my %clones; 419 return unless @clones; 420 return $self->_make_clones(\@clones); 421} 422 423sub _make_clones { 424 my $self = shift; 425 my $features = shift; 426 427 my (%clones,@canonical_clones); 428 my $start_label = $self->strand < 0 ? 'end' : 'start'; 429 my $end_label = $self->strand < 0 ? 'start' : 'end'; 430 for my $feature (@$features) { 431 $clones{$feature->info}{$start_label} = $feature->start if $feature->type eq 'Clone_left_end'; 432 $clones{$feature->info}{$end_label} = $feature->start if $feature->type eq 'Clone_right_end'; 433 434 if ($feature->type eq 'Sequence') { 435 my $info = $feature->info; 436 next if $info =~ /LINK|CHROMOSOME|\.\w+$/; 437 if ($info->Genomic_canonical(0)) { 438 push (@canonical_clones,$info->Clone) if $info->Clone; 439 } 440 } 441 } 442 443 foreach (@canonical_clones) { 444 $clones{$_} ||= {}; 445 } 446 447 my @features; 448 my ($r,$r_offset,$r_strand) = $self->refseq; 449 my $parent = $self->parent; 450 my $abs = $self->absolute; 451 if ($abs) { 452 $r_offset = 0; 453 $r = $parent; 454 $r_strand = '+1'; 455 } 456 457 # BAD HACK ALERT. WE DON'T KNOW WHERE THE LEFT END OF THE CLONE IS SO WE USE 458 # THE MAGIC NUMBER -99_999_999 to mean "off left end" and 459 # +99_999_999 to mean "off right end" 460 for my $clone (keys %clones) { 461 my $start = $clones{$clone}{start} || -99_999_999; 462 my $end = $clones{$clone}{end} || +99_999_999; 463 my $phony_gff = join "\t",($parent,'Clone','structural',$start,$end,'.','.','.',qq(Clone "$clone")); 464 push @features,Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$phony_gff); 465 } 466 return @features; 467} 468 469# Assemble a list of "GappedAlignment" objects. These objects 470# contain a list of aligned segments. 471sub alignments { 472 my $self = shift; 473 my @subtypes = @_; 474 my @types = map { "similarity:\^$_\$" } @subtypes; 475 push @types,'similarity' unless @types; 476 return $self->features(@types); 477} 478 479sub segments { 480 my $self = shift; 481 return; 482} 483 484sub _make_alignments { 485 my $self = shift; 486 my $features = shift; 487 require Ace::Sequence::GappedAlignment; 488 489 my %homol; 490 491 for my $feature (@$features) { 492 next unless $feature->type eq 'similarity'; 493 my $target = $feature->info; 494 my $subtype = $feature->subtype; 495 push @{$homol{$target,$subtype}},$feature; 496 } 497 498 # map onto Ace::Sequence::GappedAlignment objects 499 return map {Ace::Sequence::GappedAlignment->new($homol{$_})} keys %homol; 500} 501 502# return list of features quickly 503sub feature_list { 504 my $self = shift; 505 return $self->{'feature_list'} if $self->{'feature_list'}; 506 return unless my $raw = $self->_query('seqfeatures -version 2 -list'); 507 return $self->{'feature_list'} = Ace::Sequence::FeatureList->new($raw); 508} 509 510# transform a GFF file into the coordinate system of the sequence 511sub transformGFF { 512 my $self = shift; 513 my $gff = shift; 514 my $parent = $self->parent; 515 my $strand = $self->{strand}; 516 my $source = $self->source; 517 my ($ref_source,$ref_offset,$ref_strand) = $self->refseq; 518 $ref_source ||= $source; 519 $ref_strand ||= $strand; 520 521 if ($ref_strand > 0) { 522 my $o = defined($ref_offset) ? $ref_offset : ($self->p_offset + $self->offset); 523 # find anything that looks like a numeric field and subtract offset from it 524 $$gff =~ s/(?<!\")\s+(-?\d+)\s+(-?\d+)/"\t" . ($1 - $o) . "\t" . ($2 - $o)/eg; 525 $$gff =~ s/^$parent/$source/mg; 526 $$gff =~ s/\#\#sequence-region\s+\S+/##sequence-region $ref_source/m; 527 $$gff =~ s/FMAP_FEATURES\s+"\S+"/FMAP_FEATURES "$ref_source"/m; 528 return; 529 } else { # strand eq '-' 530 my $o = defined($ref_offset) ? (2 + $ref_offset) : (2 + $self->p_offset - $self->offset); 531 $$gff =~ s/(?<!\")\s+(-?\d+)\s+(-?\d+)\s+([.\d]+)\s+(\S)/join "\t",'',$o-$2,$o-$1,$3,$plusminus{$4}/eg; 532 $$gff =~ s/(Target \"[^\"]+\" )(-?\d+) (-?\d+)/$1 $3 $2/g; 533 $$gff =~ s/^$parent/$source/mg; 534 $$gff =~ s/\#\#sequence-region\s+\S+\s+(-?\d+)\s+(-?\d+)/"##sequence-region $ref_source " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em; 535 $$gff =~ s/FMAP_FEATURES\s+"\S+"\s+(-?\d+)\s+(-?\d+)/"FMAP_FEATURES \"$ref_source\" " . ($o - $2) . ' ' . ($o - $1) . ' (reversed)'/em; 536 } 537 538} 539 540# return a name for the object 541sub name { 542 return shift->source_seq->name; 543} 544 545# for compatibility with Ace::Sequence::Feature 546sub info { 547 return shift->source_seq; 548} 549 550###################### internal functions ################# 551# not necessarily object-oriented!! 552 553# return parent, parent offset and strand 554sub find_parent { 555 my $obj = shift; 556 557 # first, if we are passed an Ace::Sequence, then we can inherit 558 # these settings directly 559 return (@{$obj}{qw(parent p_offset length)},$obj->r_strand) 560 if $obj->isa('Ace::Sequence'); 561 562 # otherwise, if we are passed an Ace::Object, then we must 563 # traverse upwards until we find a suitable parent 564 return _traverse($obj) if $obj->isa('Ace::Object'); 565 566 # otherwise, we don't know what to do... 567 croak "Source sequence not an Ace::Object or an Ace::Sequence"; 568} 569 570sub _get_parent { 571 my $obj = shift; 572 # ** DANGER DANGER WILL ROBINSON! ** 573 # This is an experiment in caching parents to speed lookups. Probably eats memory voraciously. 574 return $CACHE{$obj} if CACHE && exists $CACHE{$obj}; 575 my $p = $obj->get(S_Parent=>2)|| $obj->get(Source=>1); 576 return unless $p; 577 return CACHE ? $CACHE{$obj} = $p->fetch 578 : $p->fetch; 579} 580 581sub _get_children { 582 my $obj = shift; 583 my @pieces = $obj->get(S_Child=>2); 584 return @pieces if @pieces; 585 return @pieces = $obj->get('Subsequence'); 586} 587 588# get sequence, offset and strand of topmost container 589sub _traverse { 590 my $obj = shift; 591 my ($offset,$length); 592 593 # invoke seqget to find the top-level container for this sequence 594 my ($tl,$tl_start,$tl_end) = _get_toplevel($obj); 595 $tl_start ||= 0; 596 $tl_end ||= 0; 597 598 # make it an object 599 $tl = ref($obj)->new(-name=>$tl,-class=>'Sequence',-db=>$obj->db); 600 601 $offset += $tl_start - 1; # offset to beginning of toplevel 602 $length ||= abs($tl_end - $tl_start) + 1; 603 my $strand = $tl_start < $tl_end ? +1 : -1; 604 605 return ($tl,$offset,$strand < 0 ? ($length,'-1') : ($length,'+1') ) if $length; 606} 607 608sub _get_toplevel { 609 my $obj = shift; 610 my $class = $obj->class; 611 my $name = $obj->name; 612 613 my $smap = $obj->db->raw_query("gif smap -from $class:$name"); 614 my ($parent,$pstart,$pstop,$tstart,$tstop,$map_type) = 615 $smap =~ /^SMAP\s+(\S+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(\d+)\s+(.+)/; 616 617 $parent ||= ''; 618 $parent =~ s/^Sequence://; # remove this in next version of Acedb 619 return ($parent,$pstart,$pstop); 620} 621 622# create subroutine that filters GFF files for certain feature types 623sub _make_filter { 624 my $self = shift; 625 my $automerge = $self->automerge; 626 627 # parse out the filter 628 my %filter; 629 foreach (@_) { 630 my ($type,$filter) = split(':',$_,2); 631 if ($automerge && lc($type) eq 'transcript') { 632 @filter{'exon','intron','Sequence','cds'} = ([undef],[undef],[undef],[undef]); 633 } elsif ($automerge && lc($type) eq 'clone') { 634 @filter{'Clone_left_end','Clone_right_end','Sequence'} = ([undef],[undef],[undef]); 635 } else { 636 push @{$filter{$type}},$filter; 637 } 638 } 639 640 # create pattern-match sub 641 my $sub; 642 my $promiscuous; # indicates that there is a subtype without a type 643 644 if (%filter) { 645 my $s = "sub { my \@d = split(\"\\t\",\$_[0]);\n"; 646 for my $type (keys %filter) { 647 my $expr; 648 my $subtypes = $filter{$type}; 649 if ($type ne '') { 650 for my $st (@$subtypes) { 651 $expr .= defined $st ? "return 1 if \$d[2]=~/$type/i && \$d[1]=~/$st/i;\n" 652 : "return 1 if \$d[2]=~/$type/i;\n" 653 } 654 } else { # no type, only subtypes 655 $promiscuous++; 656 for my $st (@$subtypes) { 657 next unless defined $st; 658 $expr .= "return 1 if \$d[1]=~/$st/i;\n"; 659 } 660 } 661 $s .= $expr; 662 } 663 $s .= "return;\n }"; 664 665 $sub = eval $s; 666 croak $@ if $@; 667 } else { 668 $sub = sub { 1; } 669 } 670 return ($sub,$promiscuous ? [] : [keys %filter]); 671} 672 673# turn a GFF file and a filter into a list of Ace::Sequence::Feature objects 674sub _make_features { 675 my $self = shift; 676 my ($gff,$filter) = @_; 677 678 my ($r,$r_offset,$r_strand) = $self->refseq; 679 my $parent = $self->parent; 680 my $abs = $self->absolute; 681 if ($abs) { 682 $r_offset = 0; 683 $r = $parent; 684 $r_strand = '+1'; 685 } 686 my @features = map {Ace::Sequence::Feature->new($parent,$r,$r_offset,$r_strand,$abs,$_)} 687 grep !m@^(?:\#|//)@ && $filter->($_),split("\n",$gff); 688} 689 690 691# low level GFF call, no changing absolute to relative coordinates 692sub _gff { 693 my $self = shift; 694 my ($opt,$db) = @_; 695 my $data = $self->_query("seqfeatures -version 2 $opt",$db); 696 $data =~ s/\0+\Z//; 697 return $data; #blasted nulls! 698} 699 700# shortcut for running a gif query 701sub _query { 702 my $self = shift; 703 my $command = shift; 704 my $db = shift || $self->db; 705 706 my $parent = $self->parent; 707 my $start = $self->start(1); 708 my $end = $self->end(1); 709 ($start,$end) = ($end,$start) if $start > $end; #flippity floppity 710 711 my $coord = "-coords $start $end"; 712 713 # BAD BAD HACK ALERT - CHECKS THE QUERY THAT IS PASSED DOWN 714 # ALSO MAKES THINGS INCOMPATIBLE WITH PRIOR 4.9 servers. 715# my $opt = $command =~ /seqfeatures/ ? '-nodna' : ''; 716 my $opt = '-noclip'; 717 718 my $query = "gif seqget $parent $opt $coord ; $command"; 719 warn $query if $self->debug; 720 721 return $db->raw_query("gif seqget $parent $opt $coord ; $command"); 722} 723 724# utility function -- reverse complement 725sub _complement { 726 my $dna = shift; 727 $$dna =~ tr/GATCgatc/CTAGctag/; 728 $$dna = scalar reverse $$dna; 729} 730 731sub _feature_filter { 732 my $self = shift; 733 my $features = shift; 734 return '' unless $features; 735 my $opt = ''; 736 $opt = '+feature ' . join('|',@$features) if ref($features) eq 'ARRAY' && @$features; 737 $opt = "+feature $features" unless ref $features; 738 $opt; 739} 740 7411; 742 743=head1 NAME 744 745Ace::Sequence - Examine ACeDB Sequence Objects 746 747=head1 SYNOPSIS 748 749 # open database connection and get an Ace::Object sequence 750 use Ace::Sequence; 751 752 $db = Ace->connect(-host => 'stein.cshl.org',-port => 200005); 753 $obj = $db->fetch(Predicted_gene => 'ZK154.3'); 754 755 # Wrap it in an Ace::Sequence object 756 $seq = Ace::Sequence->new($obj); 757 758 # Find all the exons 759 @exons = $seq->features('exon'); 760 761 # Find all the exons predicted by various versions of "genefinder" 762 @exons = $seq->features('exon:genefinder.*'); 763 764 # Iterate through the exons, printing their start, end and DNA 765 for my $exon (@exons) { 766 print join "\t",$exon->start,$exon->end,$exon->dna,"\n"; 767 } 768 769 # Find the region 1000 kb upstream of the first exon 770 $sub = Ace::Sequence->new(-seq=>$exons[0], 771 -offset=>-1000,-length=>1000); 772 773 # Find all features in that area 774 @features = $sub->features; 775 776 # Print its DNA 777 print $sub->dna; 778 779 # Create a new Sequence object from the first 500 kb of chromosome 1 780 $seq = Ace::Sequence->new(-name=>'CHROMOSOME_I',-db=>$db, 781 -offset=>0,-length=>500_000); 782 783 # Get the GFF dump as a text string 784 $gff = $seq->gff; 785 786 # Limit dump to Predicted_genes 787 $gff_genes = $seq->gff(-features=>'Predicted_gene'); 788 789 # Return a GFF object (using optional GFF.pm module from Sanger) 790 $gff_obj = $seq->GFF; 791 792=head1 DESCRIPTION 793 794I<Ace::Sequence>, and its allied classes L<Ace::Sequence::Feature> and 795L<Ace::Sequence::FeatureList>, provide a convenient interface to the 796ACeDB Sequence classes and the GFF sequence feature file format. 797 798Using this class, you can define a region of the genome by using a 799landmark (sequenced clone, link, superlink, predicted gene), an offset 800from that landmark, and a distance. Offsets and distances can be 801positive or negative. This will return an I<Ace::Sequence> object. 802Once a region is defined, you may retrieve its DNA sequence, or query 803the database for any features that may be contained within this 804region. Features can be returned as objects (using the 805I<Ace::Sequence::Feature> class), as GFF text-only dumps, or in the 806form of the GFF class defined by the Sanger Centre's GFF.pm module. 807 808This class builds on top of L<Ace> and L<Ace::Object>. Please see 809their manual pages before consulting this one. 810 811=head1 Creating New Ace::Sequence Objects, the new() Method 812 813 $seq = Ace::Sequence->new($object); 814 815 $seq = Ace::Sequence->new(-source => $object, 816 -offset => $offset, 817 -length => $length, 818 -refseq => $reference_sequence); 819 820 $seq = Ace::Sequence->new(-name => $name, 821 -db => $db, 822 -offset => $offset, 823 -length => $length, 824 -refseq => $reference_sequence); 825 826In order to create an I<Ace::Sequence> you will need an active I<Ace> 827database accessor. Sequence regions are defined using a "source" 828sequence, an offset, and a length. Optionally, you may also provide a 829"reference sequence" to establish the coordinate system for all 830inquiries. Sequences may be generated from existing I<Ace::Object> 831sequence objects, from other I<Ace::Sequence> and 832I<Ace::Sequence::Feature> objects, or from a sequence name and a 833database handle. 834 835The class method named new() is the interface to these facilities. In 836its simplest, one-argument form, you provide new() with a 837previously-created I<Ace::Object> that points to Sequence or 838sequence-like object (the meaning of "sequence-like" is explained in 839more detail below.) The new() method will return an I<Ace::Sequence> 840object extending from the beginning of the object through to its 841natural end. 842 843In the named-parameter form of new(), the following arguments are 844recognized: 845 846=over 4 847 848=item -source 849 850The sequence source. This must be an I<Ace::Object> of the "Sequence" 851class, or be a sequence-like object containing the SMap tag (see 852below). 853 854=item -offset 855 856An offset from the beginning of the source sequence. The retrieved 857I<Ace::Sequence> will begin at this position. The offset can be any 858positive or negative integer. Offets are B<0-based>. 859 860=item -length 861 862The length of the sequence to return. Either a positive or negative 863integer can be specified. If a negative length is given, the returned 864sequence will be complemented relative to the source sequence. 865 866=item -refseq 867 868The sequence to use to establish the coordinate system for the 869returned sequence. Normally the source sequence is used to establish 870the coordinate system, but this can be used to override that choice. 871You can provide either an I<Ace::Object> or just a sequence name for 872this argument. The source and reference sequences must share a common 873ancestor, but do not have to be directly related. An attempt to use a 874disjunct reference sequence, such as one on a different chromosome, 875will fail. 876 877=item -name 878 879As an alternative to using an I<Ace::Object> with the B<-source> 880argument, you may specify a source sequence using B<-name> and B<-db>. 881The I<Ace::Sequence> module will use the provided database accessor to 882fetch a Sequence object with the specified name. new() will return 883undef is no Sequence by this name is known. 884 885=item -db 886 887This argument is required if the source sequence is specified by name 888rather than by object reference. 889 890=back 891 892If new() is successful, it will create an I<Ace::Sequence> object and 893return it. Otherwise it will return undef and return a descriptive 894message in Ace->error(). Certain programming errors, such as a 895failure to provide required arguments, cause a fatal error. 896 897=head2 Reference Sequences and the Coordinate System 898 899When retrieving information from an I<Ace::Sequence>, the coordinate 900system is based on the sequence segment selected at object creation 901time. That is, the "+1" strand is the natural direction of the 902I<Ace::Sequence> object, and base pair 1 is its first base pair. This 903behavior can be overridden by providing a reference sequence to the 904new() method, in which case the orientation and position of the 905reference sequence establishes the coordinate system for the object. 906 907In addition to the reference sequence, there are two other sequences 908used by I<Ace::Sequence> for internal bookeeping. The "source" 909sequence corresponds to the smallest ACeDB sequence object that 910completely encloses the selected sequence segment. The "parent" 911sequence is the smallest ACeDB sequence object that contains the 912"source". The parent is used to derive the length and orientation of 913source sequences that are not directly associated with DNA objects. 914 915In many cases, the source sequence will be identical to the sequence 916initially passed to the new() method. However, there are exceptions 917to this rule. One common exception occurs when the offset and/or 918length cross the boundaries of the passed-in sequence. In this case, 919the ACeDB database is searched for the smallest sequence that contains 920both endpoints of the I<Ace::Sequence> object. 921 922The other common exception occurs in Ace 4.8, where there is support 923for "sequence-like" objects that contain the C<SMap> ("Sequence Map") 924tag. The C<SMap> tag provides genomic location information for 925arbitrary object -- not just those descended from the Sequence class. 926This allows ACeDB to perform genome map operations on objects that are 927not directly related to sequences, such as genetic loci that have been 928interpolated onto the physical map. When an C<SMap>-containing object 929is passed to the I<Ace::Sequence> new() method, the module will again 930choose the smallest ACeDB Sequence object that contains both 931end-points of the desired region. 932 933If an I<Ace::Sequence> object is used to create a new I<Ace::Sequence> 934object, then the original object's source is inherited. 935 936=head1 Object Methods 937 938Once an I<Ace::Sequence> object is created, you can query it using the 939following methods: 940 941=head2 asString() 942 943 $name = $seq->asString; 944 945Returns a human-readable identifier for the sequence in the form 946I<Source/start-end>, where "Source" is the name of the source 947sequence, and "start" and "end" are the endpoints of the sequence 948relative to the source (using 1-based indexing). This method is 949called automatically when the I<Ace::Sequence> is used in a string 950context. 951 952=head2 source_seq() 953 954 $source = $seq->source_seq; 955 956Return the source of the I<Ace::Sequence>. 957 958=head2 parent_seq() 959 960 $parent = $seq->parent_seq; 961 962Return the immediate ancestor of the sequence. The parent of the 963top-most sequence (such as the CHROMOSOME link) is itself. This 964method is used internally to ascertain the length of source sequences 965which are not associated with a DNA object. 966 967NOTE: this procedure is a trifle funky and cannot reliably be used to 968traverse upwards to the top-most sequence. The reason for this is 969that it will return an I<Ace::Sequence> in some cases, and an 970I<Ace::Object> in others. Use get_parent() to traverse upwards 971through a uniform series of I<Ace::Sequence> objects upwards. 972 973=head2 refseq([$seq]) 974 975 $refseq = $seq->refseq; 976 977Returns the reference sequence, if one is defined. 978 979 $seq->refseq($new_ref); 980 981Set the reference sequence. The reference sequence must share the same 982ancestor with $seq. 983 984=head2 start() 985 986 $start = $seq->start; 987 988Start of this sequence, relative to the source sequence, using 1-based 989indexing. 990 991=head2 end() 992 993 $end = $seq->end; 994 995End of this sequence, relative to the source sequence, using 1-based 996indexing. 997 998=head2 offset() 999 1000 $offset = $seq->offset; 1001 1002Offset of the beginning of this sequence relative to the source 1003sequence, using 0-based indexing. The offset may be negative if the 1004beginning of the sequence is to the left of the beginning of the 1005source sequence. 1006 1007=head2 length() 1008 1009 $length = $seq->length; 1010 1011The length of this sequence, in base pairs. The length may be 1012negative if the sequence's orientation is reversed relative to the 1013source sequence. Use abslength() to obtain the absolute value of 1014the sequence length. 1015 1016=head2 abslength() 1017 1018 $length = $seq->abslength; 1019 1020Return the absolute value of the length of the sequence. 1021 1022=head2 strand() 1023 1024 $strand = $seq->strand; 1025 1026Returns +1 for a sequence oriented in the natural direction of the 1027genomic reference sequence, or -1 otherwise. 1028 1029=head2 reversed() 1030 1031Returns true if the segment is reversed relative to the canonical 1032genomic direction. This is the same as $seq->strand < 0. 1033 1034=head2 dna() 1035 1036 $dna = $seq->dna; 1037 1038Return the DNA corresponding to this sequence. If the sequence length 1039is negative, the reverse complement of the appropriate segment will be 1040returned. 1041 1042ACeDB allows Sequences to exist without an associated DNA object 1043(which typically happens during intermediate stages of a sequencing 1044project. In such a case, the returned sequence will contain the 1045correct number of "-" characters. 1046 1047=head2 name() 1048 1049 $name = $seq->name; 1050 1051Return the name of the source sequence as a string. 1052 1053=head2 get_parent() 1054 1055 $parent = $seq->parent; 1056 1057Return the immediate ancestor of this I<Ace::Sequence> (i.e., the 1058sequence that contains this one). The return value is a new 1059I<Ace::Sequence> or undef, if no parent sequence exists. 1060 1061=head2 get_children() 1062 1063 @children = $seq->get_children(); 1064 1065Returns all subsequences that exist as independent objects in the 1066ACeDB database. What exactly is returned is dependent on the data 1067model. In older ACeDB databases, the only subsequences are those 1068under the catchall Subsequence tag. In newer ACeDB databases, the 1069objects returned correspond to objects to the right of the S_Child 1070subtag using a tag[2] syntax, and may include Predicted_genes, 1071Sequences, Links, or other objects. The return value is a list of 1072I<Ace::Sequence> objects. 1073 1074=head2 features() 1075 1076 @features = $seq->features; 1077 @features = $seq->features('exon','intron','Predicted_gene'); 1078 @features = $seq->features('exon:GeneFinder','Predicted_gene:hand.*'); 1079 1080features() returns an array of I<Sequence::Feature> objects. If 1081called without arguments, features() returns all features that cross 1082the sequence region. You may also provide a filter list to select a 1083set of features by type and subtype. The format of the filter list 1084is: 1085 1086 type:subtype 1087 1088Where I<type> is the class of the feature (the "feature" field of the 1089GFF format), and I<subtype> is a description of how the feature was 1090derived (the "source" field of the GFF format). Either of these 1091fields can be absent, and either can be a regular expression. More 1092advanced filtering is not supported, but is provided by the Sanger 1093Centre's GFF module. 1094 1095The order of the features in the returned list is not specified. To 1096obtain features sorted by position, use this idiom: 1097 1098 @features = sort { $a->start <=> $b->start } $seq->features; 1099 1100=head2 feature_list() 1101 1102 my $list = $seq->feature_list(); 1103 1104This method returns a summary list of the features that cross the 1105sequence in the form of a L<Ace::Feature::List> object. From the 1106L<Ace::Feature::List> object you can obtain the list of feature names 1107and the number of each type. The feature list is obtained from the 1108ACeDB server with a single short transaction, and therefore has much 1109less overhead than features(). 1110 1111See L<Ace::Feature::List> for more details. 1112 1113=head2 transcripts() 1114 1115This returns a list of Ace::Sequence::Transcript objects, which are 1116specializations of Ace::Sequence::Feature. See L<Ace::Sequence::Transcript> 1117for details. 1118 1119=head2 clones() 1120 1121This returns a list of Ace::Sequence::Feature objects containing 1122reconstructed clones. This is a nasty hack, because ACEDB currently 1123records clone ends, but not the clones themselves, meaning that we 1124will not always know both ends of the clone. In this case the missing 1125end has a synthetic position of -99,999,999 or +99,999,999. Sorry. 1126 1127=head2 gff() 1128 1129 $gff = $seq->gff(); 1130 $gff = $seq->gff(-abs => 1, 1131 -features => ['exon','intron:GeneFinder']); 1132 1133This method returns a GFF file as a scalar. The following arguments 1134are optional: 1135 1136=over 4 1137 1138=item -abs 1139 1140Ordinarily the feature entries in the GFF file will be returned in 1141coordinates relative to the start of the I<Ace::Sequence> object. 1142Position 1 will be the start of the sequence object, and the "+" 1143strand will be the sequence object's natural orientation. However if 1144a true value is provided to B<-abs>, the coordinate system used will 1145be relative to the start of the source sequence, i.e. the native ACeDB 1146Sequence object (usually a cosmid sequence or a link). 1147 1148If a reference sequence was provided when the I<Ace::Sequence> was 1149created, it will be used by default to set the coordinate system. 1150Relative coordinates can be reenabled by providing a false value to 1151B<-abs>. 1152 1153Ordinarily the coordinate system manipulations automatically "do what 1154you want" and you will not need to adjust them. See also the abs() 1155method described below. 1156 1157=item -features 1158 1159The B<-features> argument filters the features according to a list of 1160types and subtypes. The format is identical to the one described for 1161the features() method. A single filter may be provided as a scalar 1162string. Multiple filters may be passed as an array reference. 1163 1164=back 1165 1166See also the GFF() method described next. 1167 1168=head2 GFF() 1169 1170 $gff_object = $seq->gff; 1171 $gff_object = $seq->gff(-abs => 1, 1172 -features => ['exon','intron:GeneFinder']); 1173 1174The GFF() method takes the same arguments as gff() described above, 1175but it returns a I<GFF::GeneFeatureSet> object from the GFF.pm 1176module. If the GFF module is not installed, this method will generate 1177a fatal error. 1178 1179=head2 absolute() 1180 1181 $abs = $seq->absolute; 1182 $abs = $seq->absolute(1); 1183 1184This method controls whether the coordinates of features are returned 1185in absolute or relative coordinates. "Absolute" coordinates are 1186relative to the underlying source or reference sequence. "Relative" 1187coordinates are relative to the I<Ace::Sequence> object. By default, 1188coordinates are relative unless new() was provided with a reference 1189sequence. This default can be examined and changed using absolute(). 1190 1191=head2 automerge() 1192 1193 $merge = $seq->automerge; 1194 $seq->automerge(0); 1195 1196This method controls whether groups of features will automatically be 1197merged together by the features() call. If true (the default), then 1198the left and right end of clones will be merged into "clone" features, 1199introns, exons and CDS entries will be merged into 1200Ace::Sequence::Transcript objects, and similarity entries will be 1201merged into Ace::Sequence::GappedAlignment objects. 1202 1203=head2 db() 1204 1205 $db = $seq->db; 1206 1207Returns the L<Ace> database accessor associated with this sequence. 1208 1209=head1 SEE ALSO 1210 1211L<Ace>, L<Ace::Object>, L<Ace::Sequence::Feature>, 1212L<Ace::Sequence::FeatureList>, L<GFF> 1213 1214=head1 AUTHOR 1215 1216Lincoln Stein <lstein@cshl.org> with extensive help from Jean 1217Thierry-Mieg <mieg@kaa.crbm.cnrs-mop.fr> 1218 1219Many thanks to David Block <dblock@gene.pbi.nrc.ca> for finding and 1220fixing the nasty off-by-one errors. 1221 1222Copyright (c) 1999, Lincoln D. Stein 1223 1224This library is free software; you can redistribute it and/or modify 1225it under the same terms as Perl itself. See DISCLAIMER.txt for 1226disclaimers of warranty. 1227 1228=cut 1229 1230__END__ 1231