1# 2# BioPerl module for Bio::Location::Fuzzy 3# Please direct questions and support issues to <bioperl-l@bioperl.org> 4# 5# Cared for by Jason Stajich <jason@bioperl.org> 6# 7# Copyright Jason Stajich 8# 9# You may distribute this module under the same terms as perl itself 10# POD documentation - main docs before the code 11 12=head1 NAME 13 14Bio::Location::Fuzzy - Implementation of a Location on a Sequence 15which has unclear start and/or end locations 16 17=head1 SYNOPSIS 18 19 use Bio::Location::Fuzzy; 20 my $fuzzylocation = Bio::Location::Fuzzy->new( 21 -start => '<30', 22 -end => 90, 23 -location_type => '..'); 24 25 print "location string is ", $fuzzylocation->to_FTstring(), "\n"; 26 print "location is of the type ", $fuzzylocation->location_type, "\n"; 27 28=head1 DESCRIPTION 29 30This module contains the necessary methods for representing a 31Fuzzy Location, one that does not have clear start and/or end points. 32This will initially serve to handle features from Genbank/EMBL feature 33tables that are written as 1^100 meaning between bases 1 and 100 or 34E<lt>100..300 meaning it starts somewhere before 100. Advanced 35implementations of this interface may be able to handle the necessary 36logic of overlaps/intersection/contains/union. It was constructed to 37handle fuzzy locations that can be represented in Genbank/EMBL and 38Swissprot. 39 40=head1 FEEDBACK 41 42User feedback is an integral part of the evolution of this and other 43Bioperl modules. Send your comments and suggestions preferably to one 44of the Bioperl mailing lists. Your participation is much appreciated. 45 46 bioperl-l@bioperl.org - General discussion 47 http://bioperl.org/wiki/Mailing_lists - About the mailing lists 48 49=head2 Support 50 51Please direct usage questions or support issues to the mailing list: 52 53I<bioperl-l@bioperl.org> 54 55rather than to the module maintainer directly. Many experienced and 56reponsive experts will be able look at the problem and quickly 57address it. Please include a thorough description of the problem 58with code and data examples if at all possible. 59 60=head2 Reporting Bugs 61 62Report bugs to the Bioperl bug tracking system to help us keep track 63the bugs and their resolution. Bug reports can be submitted via the 64web: 65 66 https://github.com/bioperl/bioperl-live/issues 67 68=head1 AUTHOR - Jason Stajich 69 70Email jason-at-bioperl-dot-org 71 72=head1 APPENDIX 73 74The rest of the documentation details each of the object 75methods. Internal methods are usually preceded with a _ 76 77=cut 78 79# Let the code begin... 80 81package Bio::Location::Fuzzy; 82$Bio::Location::Fuzzy::VERSION = '1.7.7'; 83use strict; 84 85use base qw(Bio::Location::Atomic Bio::Location::FuzzyLocationI); 86 87our @LOCATIONCODESBSANE = (undef, 'EXACT', 'WITHIN', 'BETWEEN', 'UNCERTAIN', 88 'BEFORE', 'AFTER'); 89 90our %FUZZYCODES = ( 'EXACT' => '..', # Position is 'exact 91 # Exact position is unknown, but is within the range specified, ((1.2)..100) 92 'WITHIN' => '.', 93 # 1^2 94 'BETWEEN' => '^', 95 'IN-BETWEEN' => '^', 96 'UNCERTAIN' => '?', 97 # <100 98 'BEFORE' => '<', 99 # >10 100 'AFTER' => '>'); 101 102 # The following regular expressions map to fuzzy location types. Every 103 # expression must match the complete encoded point string, and must 104 # contain two groups identifying min and max. Empty matches are automatic. 105 # converted to undef, except for 'EXACT', for which max is set to equal 106 # min. 107 108our %FUZZYPOINTENCODE = ( 109 '\>(\d+)(.{0})' => 'AFTER', 110 '\<(.{0})(\d+)' => 'BEFORE', 111 '(\d+)' => 'EXACT', 112 '\?(\d*)' => 'UNCERTAIN', 113 '(\d+)(.{0})\>' => 'AFTER', 114 '(.{0})(\d+)\<' => 'BEFORE', 115 '(\d+)\.(\d+)' => 'WITHIN', 116 '(\d+)\^(\d+)' => 'BETWEEN', 117 ); 118 119our %FUZZYRANGEENCODE = ( '\.' => 'WITHIN', 120 '\.\.' => 'EXACT', 121 '\^' => 'IN-BETWEEN' ); 122 123=head2 new 124 125 Title : new 126 Usage : my $fuzzyloc = Bio::Location::Fuzzy->new( @args); 127 Function: 128 Returns : 129 Args : -start => value for start (initialize by superclass) 130 -end => value for end (initialize by superclass) 131 -strand => value for strand (initialize by superclass) 132 -location_type => either ('EXACT','WITHIN','IN-BETWEEN', 133 'UNCERTAIN') OR ( 1,2,3,4) 134 -start_ext=> extension for start - defaults to 0, 135 -start_fuz= fuzzy code for start can be 136 ('EXACT','WITHIN','BETWEEN','BEFORE','AFTER', 137 'UNCERTAIN' ) OR 138 a value 1 - 5 corresponding to index+1 above 139 -end_ext=> extension for end - defaults to 0, 140 -end_fuz= fuzzy code for end can be 141 ('EXACT','WITHIN','BETWEEN','BEFORE','AFTER', 142 'UNCERTAIN') OR 143 a value 1 - 5 corresponding to index+1 above 144 145=cut 146 147sub new { 148 my ($class, @args) = @_; 149 my $self = $class->SUPER::new(@args); 150 my ($location_type, $start_ext, $start_fuz, $end_ext, $end_fuz) = 151 $self->_rearrange([ qw(LOCATION_TYPE START_EXT START_FUZ 152 END_EXT END_FUZ ) 153 ], @args); 154 155 $location_type && $self->location_type($location_type); 156 $start_ext && $self->max_start($self->min_start + $start_ext); 157 $end_ext && $self->max_end($self->min_end + $end_ext); 158 $start_fuz && $self->start_pos_type($start_fuz); 159 $end_fuz && $self->end_pos_type($end_fuz); 160 161 return $self; 162} 163 164=head2 location_type 165 166 Title : location_type 167 Usage : my $location_type = $location->location_type(); 168 Function: Get location type encoded as text 169 Returns : string ('EXACT', 'WITHIN', 'IN-BETWEEN', 'UNCERTAIN') 170 Args : none 171 172=cut 173 174sub location_type { 175 my ($self,$value) = @_; 176 if( defined $value || ! defined $self->{'_location_type'} ) { 177 $value = 'EXACT' unless defined $value; 178 if(! defined $FUZZYCODES{$value} ) { 179 $value = uc($value); 180 if( $value =~ /\.\./ ) { 181 $value = 'EXACT'; 182 } elsif( $value =~ /^\.$/ ) { 183 $value = 'WITHIN'; 184 } elsif( $value =~ /\^/ ) { 185 $value = 'IN-BETWEEN'; 186 $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". 187 $self->start. "] and [". $self->end. "]") 188 if defined $self->start && defined $self->end && 189 ($self->end - 1 == $self->start); 190 } elsif( $value =~ /\?/ ) { 191 $value = 'UNCERTAIN'; 192 } elsif( $value ne 'EXACT' && $value ne 'WITHIN' && 193 $value ne 'IN-BETWEEN' ) { 194 $self->throw("Did not specify a valid location type"); 195 } 196 } 197 $self->{'_location_type'} = $value; 198 } 199 return $self->{'_location_type'}; 200} 201 202=head1 LocationI methods 203 204=head2 length 205 206 Title : length 207 Usage : $length = $fuzzy_loc->length(); 208 Function: Get the length of this location. 209 210 Note that the length of a fuzzy location will always depend 211 on the currently active interpretation of start and end. The 212 result will therefore vary for different CoordinatePolicy objects. 213 214 Returns : an integer 215 Args : none 216 217=cut 218 219#sub length { 220# my($self) = @_; 221# return $self->SUPER::length() if( !$self->start || !$self->end); 222# $self->warn('Length is not valid for a FuzzyLocation'); 223# return 0; 224#} 225 226=head2 start 227 228 Title : start 229 Usage : $start = $fuzzy->start(); 230 Function: get/set start of this range, handling fuzzy_starts 231 Returns : a positive integer representing the start of the location 232 Args : start location on set (can be fuzzy point string) 233 234=cut 235 236sub start { 237 my($self,$value) = @_; 238 if( defined $value ) { 239 my ($encode,$min,$max) = $self->_fuzzypointdecode($value); 240 $self->start_pos_type($encode); 241 $self->min_start($min); 242 $self->max_start($max); 243 } 244 245 $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [" 246 . $self->SUPER::start. "] and [". $self->SUPER::end. "]") 247 if $self->location_type eq 'IN-BETWEEN' && defined $self->SUPER::end && 248 ($self->SUPER::end - 1 == $self->SUPER::start); 249 250 return $self->SUPER::start(); 251} 252 253=head2 end 254 255 Title : end 256 Usage : $end = $fuzzy->end(); 257 Function: get/set end of this range, handling fuzzy_ends 258 Returns : a positive integer representing the end of the range 259 Args : end location on set (can be fuzzy string) 260 261=cut 262 263sub end { 264 my($self,$value) = @_; 265 if( defined $value ) { 266 my ($encode,$min,$max) = $self->_fuzzypointdecode($value); 267 $self->end_pos_type($encode); 268 $self->min_end($min); 269 $self->max_end($max); 270 } 271 272 $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [". 273 $self->SUPER::start. "] and [". $self->SUPER::end. "]") 274 if $self->location_type eq 'IN-BETWEEN' && defined $self->SUPER::start && 275 ($self->SUPER::end - 1 == $self->SUPER::start); 276 277 return $self->SUPER::end(); 278} 279 280=head2 min_start 281 282 Title : min_start 283 Usage : $min_start = $fuzzy->min_start(); 284 Function: get/set the minimum starting point 285 Returns : the minimum starting point from the contained sublocations 286 Args : integer or undef on set 287 288=cut 289 290sub min_start { 291 my ($self,@args) = @_; 292 293 if(@args) { 294 $self->{'_min_start'} = $args[0]; # the value may be undef! 295 } 296 return $self->{'_min_start'}; 297} 298 299=head2 max_start 300 301 Title : max_start 302 Usage : my $maxstart = $location->max_start(); 303 Function: Get/set maximum starting location of feature startpoint 304 Returns : integer or undef if no maximum starting point. 305 Args : integer or undef on set 306 307=cut 308 309sub max_start { 310 my ($self,@args) = @_; 311 312 if(@args) { 313 $self->{'_max_start'} = $args[0]; # the value may be undef! 314 } 315 return $self->{'_max_start'}; 316} 317 318=head2 start_pos_type 319 320 Title : start_pos_type 321 Usage : my $start_pos_type = $location->start_pos_type(); 322 Function: Get/set start position type. 323 Returns : type of position coded as text 324 ('BEFORE','AFTER','EXACT','WITHIN','BETWEEN','UNCERTAIN') 325 Args : a string on set 326 327=cut 328 329sub start_pos_type { 330 my ($self,$value) = @_; 331 if(defined $value && $value =~ /^\d+$/ ) { 332 if( $value == 0 ) { $value = 'EXACT'; } 333 else { 334 my $v = $LOCATIONCODESBSANE[$value]; 335 if( ! defined $v ) { 336 $self->warn("Provided value $value which I don't understand,". 337 " reverting to 'EXACT'"); 338 $v = 'EXACT'; 339 } 340 $value = $v; 341 } 342 } 343 if(defined($value)) { 344 $self->{'_start_pos_type'} = $value; 345 } 346 return $self->{'_start_pos_type'}; 347} 348 349=head2 min_end 350 351 Title : min_end 352 Usage : my $minend = $location->min_end(); 353 Function: Get/set minimum ending location of feature endpoint 354 Returns : integer or undef if no minimum ending point. 355 Args : integer or undef on set 356 357=cut 358 359sub min_end { 360 my ($self,@args) = @_; 361 362 if(@args) { 363 $self->{'_min_end'} = $args[0]; # the value may be undef! 364 } 365 return $self->{'_min_end'}; 366} 367 368=head2 max_end 369 370 Title : max_end 371 Usage : my $maxend = $location->max_end(); 372 Function: Get/set maximum ending location of feature endpoint 373 Returns : integer or undef if no maximum ending point. 374 Args : integer or undef on set 375 376=cut 377 378sub max_end { 379 my ($self,@args) = @_; 380 381 if(@args) { 382 $self->{'_max_end'} = $args[0]; # the value may be undef! 383 } 384 return $self->{'_max_end'}; 385} 386 387=head2 end_pos_type 388 389 Title : end_pos_type 390 Usage : my $end_pos_type = $location->end_pos_type(); 391 Function: Get/set end position type. 392 Returns : type of position coded as text 393 ('BEFORE','AFTER','EXACT','WITHIN','BETWEEN','UNCERTAIN') 394 Args : a string on set 395 396=cut 397 398sub end_pos_type { 399 my ($self,$value) = @_; 400 if( defined $value && $value =~ /^\d+$/ ) { 401 if( $value == 0 ) { $value = 'EXACT'; } 402 else { 403 my $v = $LOCATIONCODESBSANE[$value]; 404 if( ! defined $v ) { 405 $self->warn("Provided value $value which I don't understand,". 406 " reverting to 'EXACT'"); 407 $v = 'EXACT'; 408 } 409 $value = $v; 410 } 411 } 412 413 if(defined($value)) { 414 $self->{'_end_pos_type'} = $value; 415 } 416 return $self->{'_end_pos_type'}; 417} 418 419=head2 seq_id 420 421 Title : seq_id 422 Usage : my $seqid = $location->seq_id(); 423 Function: Get/Set seq_id that location refers to 424 Returns : seq_id 425 Args : [optional] seq_id value to set 426 427=cut 428 429=head2 coordinate_policy 430 431 Title : coordinate_policy 432 433 Usage : $policy = $location->coordinate_policy(); 434 $location->coordinate_policy($mypolicy); # set may not be possible 435 Function: Get the coordinate computing policy employed by this object. 436 437 See Bio::Location::CoordinatePolicyI for documentation about 438 the policy object and its use. 439 440 The interface *does not* require implementing classes to accept 441 setting of a different policy. The implementation provided here 442 does, however, allow one to do so. 443 444 Implementors of this interface are expected to initialize every 445 new instance with a CoordinatePolicyI object. The implementation 446 provided here will return a default policy object if none has 447 been set yet. To change this default policy object call this 448 method as a class method with an appropriate argument. Note that 449 in this case only subsequently created Location objects will be 450 affected. 451 452 Returns : A Bio::Location::CoordinatePolicyI implementing object. 453 Args : On set, a Bio::Location::CoordinatePolicyI implementing object. 454 455See L<Bio::Location::CoordinatePolicyI> 456 457=cut 458 459=head2 to_FTstring 460 461 Title : to_FTstring 462 Usage : my $locstr = $location->to_FTstring() 463 Function: Get/Set seq_id that location refers to 464 Returns : seq_id 465 Args : [optional] seq_id value to set 466 467=cut 468 469sub to_FTstring { 470 my ($self) = @_; 471 my (%vals) = ( 'start' => $self->start, 472 'min_start' => $self->min_start, 473 'max_start' => $self->max_start, 474 'start_code' => $self->start_pos_type, 475 'end' => $self->end, 476 'min_end' => $self->min_end, 477 'max_end' => $self->max_end, 478 'end_code' => $self->end_pos_type ); 479 480 my (%strs) = ( 'start' => '', 481 'end' => ''); 482 my ($delimiter) = $FUZZYCODES{$self->location_type}; 483 $delimiter = $FUZZYCODES{'EXACT'} if ($self->location_type eq 'UNCERTAIN'); 484 485 my $policy = ref($self->coordinate_policy); 486 487 # I'm lazy, lets do this in a loop since behaviour will be the same for 488 # start and end 489 # The CoordinatePolicy now dictates start/end data here (bug 992) - cjf 490 foreach my $point ( qw(start end) ) { 491 if( ($vals{$point."_code"} ne 'EXACT') && 492 ($vals{$point."_code"} ne 'UNCERTAIN') ) { 493 494 # must have max and min defined to use 'WITHIN', 'BETWEEN' 495 if ((!defined $vals{"min_$point"} || 496 !defined $vals{"max_$point"}) && 497 ( $vals{$point."_code"} eq 'WITHIN' || 498 $vals{$point."_code"} eq 'BETWEEN')) 499 { 500 $vals{"min_$point"} = '' unless defined $vals{"min_$point"}; 501 $vals{"max_$point"} = '' unless defined $vals{"max_$point"}; 502 503 $self->warn("Fuzzy codes for start are in a strange state, (". 504 join(",", ($vals{"min_$point"}, 505 $vals{"max_$point"}, 506 $vals{$point."_code"})). ")"); 507 return ''; 508 } 509 510 if (defined $vals{$point."_code"} && 511 ($vals{$point."_code"} eq 'BEFORE' || 512 $vals{$point."_code"} eq 'AFTER')) 513 { 514 $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}}; 515 $strs{$point} .= $vals{"$point"}; 516 } 517 518 if( defined $vals{$point."_code"} && 519 ($vals{$point."_code"} eq 'WITHIN' || 520 $vals{$point."_code"} eq 'BETWEEN')) 521 { 522 # Expect odd results with anything but WidestCoordPolicy for now 523 $strs{$point} .= ($point eq 'start') ? 524 $vals{"$point"}. 525 $FUZZYCODES{$vals{$point."_code"}}. 526 $vals{'max_'.$point} 527 : 528 $vals{'min_'.$point}. 529 $FUZZYCODES{$vals{$point."_code"}}. 530 $vals{"$point"}; 531 $strs{$point} = "(".$strs{$point}.")"; 532 } 533 534 } elsif ($vals{$point."_code"} eq 'UNCERTAIN') { 535 $strs{$point} = $FUZZYCODES{$vals{$point."_code"}}; 536 $strs{$point} .= $vals{$point} if defined $vals{$point}; 537 } else { 538 $strs{$point} = $vals{$point}; 539 } 540 } 541 542 my $str = $strs{'start'} . $delimiter . $strs{'end'}; 543 if($self->is_remote() && $self->seq_id()) { 544 $str = $self->seq_id() . ":" . $str; 545 } 546 if( defined $self->strand && 547 $self->strand == -1 && 548 $self->location_type() ne "UNCERTAIN") { 549 $str = "complement(" . $str . ")"; 550 } elsif($self->location_type() eq "WITHIN") { 551 $str = "(".$str.")"; 552 } 553 return $str; 554} 555 556=head2 valid_Location 557 558 Title : valid_Location 559 Usage : if ($location->valid_location) {...}; 560 Function: boolean method to determine whether location is considered valid 561 (has minimum requirements for Simple implementation) 562 Returns : Boolean value: true if location is valid, false otherwise 563 Args : none 564 565=cut 566 567=head2 _fuzzypointdecode 568 569 Title : _fuzzypointdecode 570 Usage : ($type,$min,$max) = $self->_fuzzypointdecode('<5'); 571 Function: Decode a fuzzy string. 572 Returns : A 3-element array consisting of the type of location, the 573 minimum integer, and the maximum integer describing the range 574 of coordinates this start or endpoint refers to. Minimum or 575 maximum coordinate may be undefined. 576 : Returns empty array on fail. 577 Args : fuzzypoint string 578 579=cut 580 581sub _fuzzypointdecode { 582 my ($self, $string) = @_; 583 return () if( !defined $string); 584 # strip off leading and trailing space 585 $string =~ s/^\s*(\S+)\s*/$1/; 586 foreach my $pattern ( keys %FUZZYPOINTENCODE ) { 587 if( $string =~ /^$pattern$/ ) { 588 my ($min,$max) = ($1,$2) unless (($1 eq '') && (!defined $2)); 589 if( ($FUZZYPOINTENCODE{$pattern} eq 'EXACT') || 590 ($FUZZYPOINTENCODE{$pattern} eq 'UNCERTAIN') 591 ) { 592 $max = $min; 593 } else { 594 $max = undef if((defined $max) && (length($max) == 0)); 595 $min = undef if((defined $min) && (length($min) == 0)); 596 } 597 return ($FUZZYPOINTENCODE{$pattern},$min,$max); 598 } 599 } 600 if( $self->verbose >= 1 ) { 601 $self->warn("could not find a valid fuzzy encoding for $string"); 602 } 603 return (); 604} 605 6061; 607