1#
2# BioPerl module for Bio::Location::Fuzzy
3# Please direct questions and support issues to <bioperl-l@bioperl.org>
4#
5# Cared for by Jason Stajich <jason@bioperl.org>
6#
7# Copyright Jason Stajich
8#
9# You may distribute this module under the same terms as perl itself
10# POD documentation - main docs before the code
11
12=head1 NAME
13
14Bio::Location::Fuzzy - Implementation of a Location on a Sequence
15which has unclear start and/or end locations
16
17=head1 SYNOPSIS
18
19    use Bio::Location::Fuzzy;
20    my $fuzzylocation = Bio::Location::Fuzzy->new(
21                                                 -start => '<30',
22                                                 -end   => 90,
23                                                 -location_type => '..');
24
25    print "location string is ", $fuzzylocation->to_FTstring(), "\n";
26    print "location is of the type ", $fuzzylocation->location_type, "\n";
27
28=head1 DESCRIPTION
29
30This module contains the necessary methods for representing a
31Fuzzy Location, one that does not have clear start and/or end points.
32This will initially serve to handle features from Genbank/EMBL feature
33tables that are written as 1^100 meaning between bases 1 and 100 or
34E<lt>100..300 meaning it starts somewhere before 100.  Advanced
35implementations of this interface may be able to handle the necessary
36logic of overlaps/intersection/contains/union.  It was constructed to
37handle fuzzy locations that can be represented in Genbank/EMBL and
38Swissprot.
39
40=head1 FEEDBACK
41
42User feedback is an integral part of the evolution of this and other
43Bioperl modules. Send your comments and suggestions preferably to one
44of the Bioperl mailing lists.  Your participation is much appreciated.
45
46  bioperl-l@bioperl.org                  - General discussion
47  http://bioperl.org/wiki/Mailing_lists  - About the mailing lists
48
49=head2 Support
50
51Please direct usage questions or support issues to the mailing list:
52
53I<bioperl-l@bioperl.org>
54
55rather than to the module maintainer directly. Many experienced and
56reponsive experts will be able look at the problem and quickly
57address it. Please include a thorough description of the problem
58with code and data examples if at all possible.
59
60=head2 Reporting Bugs
61
62Report bugs to the Bioperl bug tracking system to help us keep track
63the bugs and their resolution.  Bug reports can be submitted via the
64web:
65
66  https://github.com/bioperl/bioperl-live/issues
67
68=head1 AUTHOR - Jason Stajich
69
70Email jason-at-bioperl-dot-org
71
72=head1 APPENDIX
73
74The rest of the documentation details each of the object
75methods. Internal methods are usually preceded with a _
76
77=cut
78
79# Let the code begin...
80
81package Bio::Location::Fuzzy;
82$Bio::Location::Fuzzy::VERSION = '1.7.7';
83use strict;
84
85use base qw(Bio::Location::Atomic Bio::Location::FuzzyLocationI);
86
87our @LOCATIONCODESBSANE = (undef, 'EXACT', 'WITHIN', 'BETWEEN', 'UNCERTAIN',
88            'BEFORE', 'AFTER');
89
90our %FUZZYCODES = ( 'EXACT' => '..', # Position is 'exact
91   # Exact position is unknown, but is within the range specified, ((1.2)..100)
92            'WITHIN' => '.',
93            # 1^2
94            'BETWEEN'    => '^',
95            'IN-BETWEEN' => '^',
96            'UNCERTAIN'  => '?',
97            # <100
98            'BEFORE'  => '<',
99            # >10
100            'AFTER'   => '>');
101
102    # The following regular expressions map to fuzzy location types. Every
103    # expression must match the complete encoded point string, and must
104    # contain two groups identifying min and max. Empty matches are automatic.
105    # converted to undef, except for 'EXACT', for which max is set to equal
106    # min.
107
108our %FUZZYPOINTENCODE = (
109    '\>(\d+)(.{0})' => 'AFTER',
110    '\<(.{0})(\d+)' => 'BEFORE',
111    '(\d+)'         => 'EXACT',
112    '\?(\d*)'       => 'UNCERTAIN',
113    '(\d+)(.{0})\>' => 'AFTER',
114    '(.{0})(\d+)\<' => 'BEFORE',
115    '(\d+)\.(\d+)'  => 'WITHIN',
116    '(\d+)\^(\d+)'  => 'BETWEEN',
117   );
118
119our %FUZZYRANGEENCODE  = (  '\.'   => 'WITHIN',
120                            '\.\.' => 'EXACT',
121                            '\^'   => 'IN-BETWEEN' );
122
123=head2 new
124
125 Title   : new
126 Usage   : my $fuzzyloc = Bio::Location::Fuzzy->new( @args);
127 Function:
128 Returns :
129 Args    : -start    => value for start  (initialize by superclass)
130           -end      => value for end    (initialize by superclass)
131           -strand   => value for strand (initialize by superclass)
132           -location_type => either ('EXACT','WITHIN','IN-BETWEEN',
133                             'UNCERTAIN') OR ( 1,2,3,4)
134           -start_ext=> extension for start - defaults to 0,
135           -start_fuz=  fuzzy code for start can be
136                      ('EXACT','WITHIN','BETWEEN','BEFORE','AFTER',
137                       'UNCERTAIN' ) OR
138                      a value 1 - 5 corresponding to index+1 above
139           -end_ext=> extension for end - defaults to 0,
140           -end_fuz=  fuzzy code for end can be
141                      ('EXACT','WITHIN','BETWEEN','BEFORE','AFTER',
142                       'UNCERTAIN') OR
143                      a value 1 - 5 corresponding to index+1 above
144
145=cut
146
147sub new {
148    my ($class, @args) = @_;
149    my $self = $class->SUPER::new(@args);
150    my ($location_type, $start_ext, $start_fuz, $end_ext, $end_fuz) =
151        $self->_rearrange([ qw(LOCATION_TYPE START_EXT START_FUZ
152                   END_EXT END_FUZ )
153                ], @args);
154
155    $location_type  && $self->location_type($location_type);
156    $start_ext && $self->max_start($self->min_start + $start_ext);
157    $end_ext   && $self->max_end($self->min_end + $end_ext);
158    $start_fuz && $self->start_pos_type($start_fuz);
159    $end_fuz   && $self->end_pos_type($end_fuz);
160
161    return $self;
162}
163
164=head2 location_type
165
166  Title   : location_type
167  Usage   : my $location_type = $location->location_type();
168  Function: Get location type encoded as text
169  Returns : string ('EXACT', 'WITHIN', 'IN-BETWEEN', 'UNCERTAIN')
170  Args    : none
171
172=cut
173
174sub location_type {
175    my ($self,$value) = @_;
176    if( defined $value || ! defined $self->{'_location_type'} ) {
177        $value = 'EXACT' unless defined $value;
178        if(! defined $FUZZYCODES{$value} )  {
179            $value = uc($value);
180            if( $value =~ /\.\./ ) {
181                $value = 'EXACT';
182            } elsif( $value =~ /^\.$/ ) {
183                $value = 'WITHIN';
184            } elsif( $value =~ /\^/ ) {
185                $value = 'IN-BETWEEN';
186                $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [".
187                             $self->start. "] and [". $self->end. "]")
188                  if defined $self->start && defined $self->end &&
189                            ($self->end - 1 == $self->start);
190            } elsif( $value =~ /\?/ ) {
191                $value = 'UNCERTAIN';
192            } elsif( $value ne 'EXACT' && $value ne 'WITHIN' &&
193                        $value ne 'IN-BETWEEN' ) {
194                $self->throw("Did not specify a valid location type");
195            }
196        }
197        $self->{'_location_type'} = $value;
198    }
199    return $self->{'_location_type'};
200}
201
202=head1 LocationI methods
203
204=head2 length
205
206  Title   : length
207  Usage   : $length = $fuzzy_loc->length();
208  Function: Get the length of this location.
209
210            Note that the length of a fuzzy location will always depend
211            on the currently active interpretation of start and end. The
212            result will therefore vary for different CoordinatePolicy objects.
213
214  Returns : an integer
215  Args    : none
216
217=cut
218
219#sub length {
220#    my($self) = @_;
221#    return $self->SUPER::length() if( !$self->start || !$self->end);
222#    $self->warn('Length is not valid for a FuzzyLocation');
223#    return 0;
224#}
225
226=head2 start
227
228  Title   : start
229  Usage   : $start = $fuzzy->start();
230  Function: get/set start of this range, handling fuzzy_starts
231  Returns : a positive integer representing the start of the location
232  Args    : start location on set (can be fuzzy point string)
233
234=cut
235
236sub start {
237    my($self,$value) = @_;
238    if( defined $value ) {
239    my ($encode,$min,$max) = $self->_fuzzypointdecode($value);
240    $self->start_pos_type($encode);
241    $self->min_start($min);
242    $self->max_start($max);
243    }
244
245    $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations ["
246                 . $self->SUPER::start. "] and [". $self->SUPER::end. "]")
247    if $self->location_type eq 'IN-BETWEEN'  && defined $self->SUPER::end &&
248                  ($self->SUPER::end - 1 == $self->SUPER::start);
249
250    return $self->SUPER::start();
251}
252
253=head2 end
254
255  Title   : end
256  Usage   : $end = $fuzzy->end();
257  Function: get/set end of this range, handling fuzzy_ends
258  Returns : a positive integer representing the end of the range
259  Args    : end location on set (can be fuzzy string)
260
261=cut
262
263sub end {
264    my($self,$value) = @_;
265    if( defined $value ) {
266    my ($encode,$min,$max) = $self->_fuzzypointdecode($value);
267    $self->end_pos_type($encode);
268    $self->min_end($min);
269    $self->max_end($max);
270    }
271
272    $self->throw("Use Bio::Location::Simple for IN-BETWEEN locations [".
273                 $self->SUPER::start. "] and [". $self->SUPER::end. "]")
274    if $self->location_type eq 'IN-BETWEEN' && defined $self->SUPER::start &&
275                ($self->SUPER::end - 1 == $self->SUPER::start);
276
277    return $self->SUPER::end();
278}
279
280=head2 min_start
281
282  Title   : min_start
283  Usage   : $min_start = $fuzzy->min_start();
284  Function: get/set the minimum starting point
285  Returns : the minimum starting point from the contained sublocations
286  Args    : integer or undef on set
287
288=cut
289
290sub min_start {
291    my ($self,@args) = @_;
292
293    if(@args) {
294    $self->{'_min_start'} = $args[0]; # the value may be undef!
295    }
296    return $self->{'_min_start'};
297}
298
299=head2 max_start
300
301  Title   : max_start
302  Usage   : my $maxstart = $location->max_start();
303  Function: Get/set maximum starting location of feature startpoint
304  Returns : integer or undef if no maximum starting point.
305  Args    : integer or undef on set
306
307=cut
308
309sub max_start {
310    my ($self,@args) = @_;
311
312    if(@args) {
313        $self->{'_max_start'} = $args[0]; # the value may be undef!
314    }
315    return $self->{'_max_start'};
316}
317
318=head2 start_pos_type
319
320  Title   : start_pos_type
321  Usage   : my $start_pos_type = $location->start_pos_type();
322  Function: Get/set start position type.
323  Returns : type of position coded as text
324            ('BEFORE','AFTER','EXACT','WITHIN','BETWEEN','UNCERTAIN')
325  Args    : a string on set
326
327=cut
328
329sub start_pos_type {
330    my ($self,$value) = @_;
331    if(defined $value &&  $value =~ /^\d+$/ ) {
332        if( $value == 0 ) { $value = 'EXACT'; }
333        else {
334            my $v = $LOCATIONCODESBSANE[$value];
335            if( ! defined $v ) {
336                $self->warn("Provided value $value which I don't understand,".
337                            " reverting to 'EXACT'");
338                $v = 'EXACT';
339            }
340            $value = $v;
341        }
342    }
343    if(defined($value)) {
344        $self->{'_start_pos_type'} = $value;
345    }
346    return $self->{'_start_pos_type'};
347}
348
349=head2 min_end
350
351  Title   : min_end
352  Usage   : my $minend = $location->min_end();
353  Function: Get/set minimum ending location of feature endpoint
354  Returns : integer or undef if no minimum ending point.
355  Args    : integer or undef on set
356
357=cut
358
359sub min_end {
360    my ($self,@args) = @_;
361
362    if(@args) {
363        $self->{'_min_end'} = $args[0]; # the value may be undef!
364    }
365    return $self->{'_min_end'};
366}
367
368=head2 max_end
369
370  Title   : max_end
371  Usage   : my $maxend = $location->max_end();
372  Function: Get/set maximum ending location of feature endpoint
373  Returns : integer or undef if no maximum ending point.
374  Args    : integer or undef on set
375
376=cut
377
378sub max_end {
379    my ($self,@args) = @_;
380
381    if(@args) {
382        $self->{'_max_end'} = $args[0]; # the value may be undef!
383    }
384    return $self->{'_max_end'};
385}
386
387=head2 end_pos_type
388
389  Title   : end_pos_type
390  Usage   : my $end_pos_type = $location->end_pos_type();
391  Function: Get/set end position type.
392  Returns : type of position coded as text
393            ('BEFORE','AFTER','EXACT','WITHIN','BETWEEN','UNCERTAIN')
394  Args    : a string on set
395
396=cut
397
398sub end_pos_type {
399    my ($self,$value) = @_;
400    if( defined $value && $value =~ /^\d+$/ ) {
401        if( $value == 0 ) { $value = 'EXACT'; }
402        else {
403            my $v = $LOCATIONCODESBSANE[$value];
404            if( ! defined $v ) {
405                $self->warn("Provided value $value which I don't understand,".
406                            " reverting to 'EXACT'");
407                $v = 'EXACT';
408            }
409            $value = $v;
410        }
411    }
412
413    if(defined($value)) {
414        $self->{'_end_pos_type'} = $value;
415    }
416    return $self->{'_end_pos_type'};
417}
418
419=head2 seq_id
420
421  Title   : seq_id
422  Usage   : my $seqid = $location->seq_id();
423  Function: Get/Set seq_id that location refers to
424  Returns : seq_id
425  Args    : [optional] seq_id value to set
426
427=cut
428
429=head2 coordinate_policy
430
431  Title   : coordinate_policy
432
433  Usage   : $policy = $location->coordinate_policy();
434            $location->coordinate_policy($mypolicy); # set may not be possible
435  Function: Get the coordinate computing policy employed by this object.
436
437            See Bio::Location::CoordinatePolicyI for documentation about
438            the policy object and its use.
439
440            The interface *does not* require implementing classes to accept
441            setting of a different policy. The implementation provided here
442            does, however, allow one to do so.
443
444            Implementors of this interface are expected to initialize every
445            new instance with a CoordinatePolicyI object. The implementation
446            provided here will return a default policy object if none has
447            been set yet. To change this default policy object call this
448            method as a class method with an appropriate argument. Note that
449            in this case only subsequently created Location objects will be
450            affected.
451
452  Returns : A Bio::Location::CoordinatePolicyI implementing object.
453  Args    : On set, a Bio::Location::CoordinatePolicyI implementing object.
454
455See L<Bio::Location::CoordinatePolicyI>
456
457=cut
458
459=head2 to_FTstring
460
461  Title   : to_FTstring
462  Usage   : my $locstr = $location->to_FTstring()
463  Function: Get/Set seq_id that location refers to
464  Returns : seq_id
465  Args    : [optional] seq_id value to set
466
467=cut
468
469sub to_FTstring {
470    my ($self) = @_;
471    my (%vals) = ( 'start' => $self->start,
472           'min_start' => $self->min_start,
473           'max_start' => $self->max_start,
474           'start_code' => $self->start_pos_type,
475           'end' => $self->end,
476           'min_end' => $self->min_end,
477           'max_end' => $self->max_end,
478           'end_code' => $self->end_pos_type );
479
480    my (%strs) = ( 'start' => '',
481           'end'   => '');
482    my ($delimiter) = $FUZZYCODES{$self->location_type};
483    $delimiter = $FUZZYCODES{'EXACT'} if ($self->location_type eq 'UNCERTAIN');
484
485    my $policy = ref($self->coordinate_policy);
486
487    # I'm lazy, lets do this in a loop since behaviour will be the same for
488    # start and end
489    # The CoordinatePolicy now dictates start/end data here (bug 992) - cjf
490    foreach my $point ( qw(start end) ) {
491        if( ($vals{$point."_code"} ne 'EXACT') &&
492            ($vals{$point."_code"} ne 'UNCERTAIN') ) {
493
494            # must have max and min defined to use 'WITHIN', 'BETWEEN'
495            if ((!defined $vals{"min_$point"} ||
496                 !defined $vals{"max_$point"}) &&
497                ( $vals{$point."_code"} eq 'WITHIN' ||
498                  $vals{$point."_code"} eq 'BETWEEN'))
499            {
500                $vals{"min_$point"} = '' unless defined $vals{"min_$point"};
501                $vals{"max_$point"} = '' unless defined $vals{"max_$point"};
502
503                $self->warn("Fuzzy codes for start are in a strange state, (".
504                        join(",", ($vals{"min_$point"},
505                               $vals{"max_$point"},
506                               $vals{$point."_code"})). ")");
507                return '';
508            }
509
510            if (defined $vals{$point."_code"} &&
511               ($vals{$point."_code"} eq 'BEFORE' ||
512                $vals{$point."_code"} eq 'AFTER'))
513            {
514                $strs{$point} .= $FUZZYCODES{$vals{$point."_code"}};
515                $strs{$point} .= $vals{"$point"};
516            }
517
518            if( defined $vals{$point."_code"} &&
519              ($vals{$point."_code"} eq 'WITHIN' ||
520               $vals{$point."_code"} eq 'BETWEEN'))
521            {
522                # Expect odd results with anything but WidestCoordPolicy for now
523                $strs{$point} .= ($point eq 'start') ?
524                        $vals{"$point"}.
525                        $FUZZYCODES{$vals{$point."_code"}}.
526                        $vals{'max_'.$point}
527                        :
528                        $vals{'min_'.$point}.
529                        $FUZZYCODES{$vals{$point."_code"}}.
530                        $vals{"$point"};
531                $strs{$point} = "(".$strs{$point}.")";
532            }
533
534        } elsif ($vals{$point."_code"} eq 'UNCERTAIN') {
535            $strs{$point}  = $FUZZYCODES{$vals{$point."_code"}};
536            $strs{$point} .= $vals{$point} if defined $vals{$point};
537        } else {
538            $strs{$point} = $vals{$point};
539        }
540    }
541
542    my $str = $strs{'start'} . $delimiter . $strs{'end'};
543    if($self->is_remote() && $self->seq_id()) {
544    $str = $self->seq_id() . ":" . $str;
545    }
546    if( defined $self->strand &&
547    $self->strand == -1 &&
548    $self->location_type() ne "UNCERTAIN") {
549    $str = "complement(" . $str . ")";
550    } elsif($self->location_type() eq "WITHIN") {
551    $str = "(".$str.")";
552    }
553    return $str;
554}
555
556=head2 valid_Location
557
558 Title   : valid_Location
559 Usage   : if ($location->valid_location) {...};
560 Function: boolean method to determine whether location is considered valid
561           (has minimum requirements for Simple implementation)
562 Returns : Boolean value: true if location is valid, false otherwise
563 Args    : none
564
565=cut
566
567=head2 _fuzzypointdecode
568
569  Title   : _fuzzypointdecode
570  Usage   : ($type,$min,$max) = $self->_fuzzypointdecode('<5');
571  Function: Decode a fuzzy string.
572  Returns : A 3-element array consisting of the type of location, the
573            minimum integer, and the maximum integer describing the range
574            of coordinates this start or endpoint refers to. Minimum or
575            maximum coordinate may be undefined.
576          : Returns empty array on fail.
577  Args    : fuzzypoint string
578
579=cut
580
581sub _fuzzypointdecode {
582    my ($self, $string) = @_;
583    return () if( !defined $string);
584    # strip off leading and trailing space
585    $string =~ s/^\s*(\S+)\s*/$1/;
586    foreach my $pattern ( keys %FUZZYPOINTENCODE ) {
587        if( $string =~ /^$pattern$/ ) {
588            my ($min,$max) = ($1,$2) unless (($1 eq '') && (!defined $2));
589            if( ($FUZZYPOINTENCODE{$pattern} eq 'EXACT') ||
590                 ($FUZZYPOINTENCODE{$pattern} eq 'UNCERTAIN')
591              ) {
592                $max = $min;
593            } else {
594                $max = undef if((defined $max) && (length($max) == 0));
595                $min = undef if((defined $min) && (length($min) == 0));
596            }
597            return ($FUZZYPOINTENCODE{$pattern},$min,$max);
598        }
599    }
600    if( $self->verbose >= 1 ) {
601        $self->warn("could not find a valid fuzzy encoding for $string");
602    }
603    return ();
604}
605
6061;
607