1################################################################# 2# Functions.pm - internal functions for reading, parsing, arrays 3################################################################# 4# Original version thanks to Tom Hladish 5# 6# $Id: Functions.pm,v 1.16 2012/02/07 21:49:27 astoltzfus Exp $ 7 8#################### START POD DOCUMENTATION ################## 9 10=head1 NAME 11 12Bio::NEXUS::Functions - Provides private utiliy functions for the module 13 14=head1 SYNOPSIS 15 16=head1 DESCRIPTION 17 18This package provides private functions that are not object-specific. 19 20=head1 COMMENTS 21 22=head1 FEEDBACK 23 24All feedback (bugs, feature enhancements, etc.) is greatly appreciated. 25 26=head1 AUTHORS 27 28 Original version by Thomas Hladish (tjhladish at yahoo) 29 30=head1 VERSION 31 32$Revision: 1.16 $ 33 34=head1 METHODS 35 36=cut 37 38package Bio::NEXUS::Functions; 39 40use strict; 41#use Data::Dumper; # XXX this is not used, might as well not import it! 42#use Carp; # XXX this is not used, might as well not import it! 43use Bio::NEXUS::Util::Exceptions; 44use vars qw(@EXPORT @EXPORT_OK @ISA $VERSION); 45use Bio::NEXUS; $VERSION = $Bio::NEXUS::VERSION; 46use Exporter (); 47 48@ISA = qw ( Exporter ); 49@EXPORT = qw( 50 &_slurp 51 &_parse_nexus_words 52 &_ntsa 53 &_stna 54 &_quote_if_needed 55 &_nexus_formatted 56 &_is_comment 57 &_is_number 58 &_is_dec_number 59 &_sci_to_dec 60 &_unique 61 &_nonunique 62 &_share_elements 63 &_fast_in_array 64 &_in_array 65 &_is_same_array 66); 67 68## READING & PARSING FUNCTIONS: 69 70=begin comment 71 72 Name : _slurp 73 Usage : $file_content = _slurp($filename); 74 Function: reads an entire file into memory 75 Returns : none 76 Args : file name (string) 77 78=end comment 79 80=cut 81 82sub _slurp { 83 my ($filename) = @_; 84 open my $fh, '<', "$filename" 85 || Bio::NEXUS::Util::Exceptions::FileError->throw( 86 'error' => "ERROR: Could not open filename <$filename> for input; $!" 87 ); 88 my $file_content = do { local ($/); <$fh> }; 89 return $file_content; 90} 91 92=begin comment 93 94 Title : _parse_nexus_words 95 Usage : $parsed_words = _parse_nexus_words($buffer); 96 Function: parse a string of text into "words" (as defined in the NEXUS standard) 97 Returns : an array ref of "words" and punctuation marks. Single-quoted expressions are single "words". Double quotes are not supported. 98 Args : text buffer 99 Notes : this method has replaced _parse_string_tokens(), which did not conform to the NEXUS standard in all its quirky splendor (particularly with regard to punctuation) 100 101=end comment 102 103=cut 104 105sub _parse_nexus_words { 106 my $buffer = shift; 107 if ( not defined $buffer ) { 108 Bio::NEXUS::Util::Exceptions::BadArgs->throw( 109 'error' => '_parse_nexus_words() requires a text string argument (the text to be parsed)' 110 ); 111 } 112 my @words; 113 my ( $word, $in_quotes ) = ( q{}, 0 ); 114 115 my @chars = split( //, $buffer ); 116 my $comment_level = 0; 117 118 # iterate through the characters 119 for ( my $i = 0; $i < @chars; $i++ ) { 120 my $char = $chars[$i]; 121 my $next = $chars[ $i + 1 ]; 122 123 if ($comment_level) { # if we are in a comment already 124 $comment_level++ if ( $char eq '[' ); 125 $comment_level-- if ( $char eq ']' ); 126 $word .= $char; 127 } 128 129 # If we see a quote 130 elsif ( $char eq q{'} ) { 131 132 # and we're already inside quotes . . . 133 if ($in_quotes) { 134 135 # check to see if this is an escaped (doubled single) quote, 136 # (unless we're already at the end of the string to be parsed). 137 if ( defined $next && $next eq q{'} ) { 138 139 # If it is, append it to the current word; 140 $word .= $char; 141 } 142 else { 143 144 # otherwise, close off the quoted string 145 $in_quotes--; 146 147 # Replace spaces with underscores (according to NEXUS, they're equivalent) 148 # 149 # This may not be correct. Certainly TreeBASE doesn't like it 150 # when we use both quoted strings and underscores in them 151 $word =~ s/ /_/g; 152 153 # Push it onto the word list, after 154 # dealing with funny apostrophe business 155 push @words, _ntsa($word); 156 157 # And clean the slate 158 $word = q{}; 159 } 160 } 161 else { 162 163 # If we weren't in quotes before, we are now 164 $in_quotes++; 165 } 166 } 167 elsif ($in_quotes) { 168 169 # We're in a quoted string, so anything can be part of the word 170 $word .= $char; 171 } 172 elsif ( $char eq '[' ) { # hit new comment, level 0 (bug if we just finished one) 173 $comment_level++; 174 $word .= $char; 175 } 176 177 # If we see NEXUS-style punctuation 178 elsif ( $char =~ /[\[\]\-(){}\/\\,;:=*"`+<>]/ ) { 179 180 push @words, &_ntsa($word) 181 182 # $word will be q{} if there was a preceding space; 183 # otherwise, it will contain some string 184 unless $word eq q{}; 185 186 # then that counts as a word (we'll deal with pos/neg 187 # numbers later in _rebuild_numbers() if that gets called) 188 push @words, $char; 189 $word = q{}; 190 } 191 192 # If we see whitespace 193 elsif ( $char =~ /\s/ ) { 194 195 # then we just finished a [probably] normal, space-delimited word 196 push @words, &_ntsa($word) 197 198 unless $word eq q{}; 199 200 # although we don't want to keep pushing it 201 # if there are multiple spaces, so we empty $word 202 $word = q{}; 203 } 204 205 # If $word isn't quoted, and $char is neither punctuation nor whitespace 206 else { 207 $word .= $char; 208 } 209 } 210 211 push @words, $word unless $word eq q{}; 212 return \@words; 213} 214 215sub _rebuild_numbers { 216 my $words = shift; 217 my @new_words; 218 219 # Don't bother checking whether the last word is a '+' or '-' 220 for ( my $i = 0; $i < ( @$words - 1 ); $i++ ) { 221 my $word = $words->[$i]; 222 my $next = $words->[ $i + 1 ]; # There will always be a next 223 224 # my $next_next = defined $words[$i +2] ? $words[$i+2] : q{}; 225 # There might be a previous 226 my $last = $i == 0 ? undef: $words->[ $i - 1 ]; 227 228 if ( $word eq '-' || $word eq '+' ) { 229 if ( my ( $num, $exp ) = $next =~ /^([\d.]+)(e)?/i ) { 230 if ( _is_dec_number($num) ) { 231 $word .= $next; 232 $i++; 233 if ($exp) { 234 235 } 236 } 237 } 238 } 239 else { 240 push @new_words, $word; 241 } 242 } 243 return \@new_words; 244} 245 246=begin comment 247 248 Title : _ntsa (nexus to standard apostrophe) 249 Usage : $standard_word = $block->_ntsa($nexus_word); 250 Function: change doubled single quotes to single single quotes (apostrophes) 251 Returns : a standard english word (or phrase) 252 Args : a nexus "word" 253 Notes : See NEXUS definition of "word" for an explanation 254 255=end comment 256 257=cut 258 259sub _ntsa { 260 my $nexus_word = shift; 261 $nexus_word =~ s/[^']''[^']/'/g; 262 return $nexus_word; 263} 264 265=begin comment 266 267 Title : _stna (standard to nexus apostrophe) 268 Usage : $nexus_word = $block->_stna($standard_word); 269 Function: change single single quotes (apostrophes) to double single quotes 270 Returns : a nexus "word" 271 Args : a standard english word (or phrase) 272 Notes : See NEXUS definition of "word" for an explanation 273 274=end comment 275 276=cut 277 278sub _stna { 279 my $standard_word = shift; 280 $standard_word =~ s/[^']'[^']/''/g; 281 return $standard_word; 282} 283 284=begin comment 285 286 Title : _quote_if_needed 287 Usage : $string = Bio::NEXUS::Block::_quote_if_needed($string); 288 Function: put single quotes around string if it contains spaces or NEXUS punctuation 289 Returns : a string, in single quotes if necessary 290 Args : a string 291 292=end comment 293 294=cut 295 296sub _quote_if_needed { 297 my $nexus_word = shift; 298 if ( $nexus_word =~ /[-\s(){}\[\]\/\\,;:=+*<>`'"]/ ) { 299 return "'$nexus_word'"; 300 } 301 else { 302 return $nexus_word; 303 } 304} 305 306=begin comment 307 308 Title : _nexus_formatted 309 Usage : $string = Bio::NEXUS::Block::_nexus_formatted($string); 310 Function: escape apostrophes and quote strings as needed for NEXUS output 311 Returns : a string 312 Args : a string 313 314=end comment 315 316=cut 317 318sub _nexus_formatted { 319 my $nexus_word = shift; 320 $nexus_word = _quote_if_needed( _stna($nexus_word) ); 321 return $nexus_word; 322} 323 324=begin comment 325 326 Name : _is_comment 327 Usage : $boolean = _is_comment($string); 328 Function: tests whether something looks like a comment 329 Returns : boolean 330 Args : string to test 331 332=end comment 333 334=cut 335 336sub _is_comment { 337 my ($string) = @_; 338 if ( $string =~ /^\[.*\]$/s ) { return 1 } 339 else { return 0 } 340} 341 342=begin comment 343 344 Title : _is_dec_number 345 Usage : if ( _is_dec_number($num) ) { do_something() }; 346 Function: verifies that a number is a normal decimal number (e.g. 3 or 9.41) 347 Returns : 1 if $num is a number, otherwise 0 348 Args : a number 349 350=end comment 351 352=cut 353 354sub _is_dec_number { 355 my ($number) = @_; 356 357 return 0 unless defined $number && length $number; 358 359 my $number_regex = qr/^[-+]? # positive or negative 360 (?: \d+ # e.g., 523 361 | \d*[.]\d+ # 3.14 or .45 362 | \d+[.]\d* # 212. or 212.0 363 ) 364 $/x; 365 366 return 0 unless defined $number && $number =~ $number_regex; 367 368 return 1; 369} 370 371=begin comment 372 373 Title : _is_number 374 Usage : if ( _is_number($num) ) { do_something() }; 375 Function: verifies that a number is of reasonable form (such as 0.4 or 6.1e2.1) 376 Returns : 1 if $num is a number, otherwise 0 377 Args : a number 378 379=end comment 380 381=cut 382 383sub _is_number { 384 my ($number) = @_; 385 386 return 0 unless defined $number && length $number; 387 388 my ( $num, $exp ) = $number =~ /^([^e]+)(?:e([^e]+))?$/i; 389 390 return 0 unless _is_dec_number($num); 391 392 return _is_dec_number($exp) if defined $exp; 393 394 return 1; 395} 396 397=begin comment 398 399 Title : _sci_to_dec 400 Usage : $decimal = _sci_to_dec($scientic_notation); 401 Function: Changes scientific notation to decimal notation 402 Returns : scalar (a number) 403 Args : scalar (a number), possibly in scientific notation 404 405=end comment 406 407=cut 408 409sub _sci_to_dec { 410 my ($sci_num) = @_; 411 412 $sci_num =~ s/\s//g; 413 return $sci_num if _is_dec_number($sci_num); 414 415 my ( $num, $exp ) = $sci_num =~ /^ ([^e]+) e ([^e]+) $/ix; 416 417 return 0 unless ( _is_dec_number($num) && _is_dec_number($exp) ); 418 419 my $dec_num = $num * ( 10**$exp ); 420 return $dec_num; 421} 422 423## ARRAY FUNCTIONS: 424 425=begin comment 426 427 Name : _any 428 Usage : _any($filename); 429 Function: reads an entire file into memory 430 Returns : none 431 Args : file name (string) 432 433=end comment 434 435=cut 436 437sub _unique { 438 my (@array) = @_; 439 my %seen = (); 440 441 # from perl cookbook. fast, and preserves order 442 my @unique = grep { !$seen{$_}++ } @array; 443 return @unique; 444} 445 446sub _nonunique { 447 my (@array) = @_; 448 my %seen = (); 449 my @nonunique = grep { $seen{$_}++ } @array; 450 return @nonunique; 451} 452 453sub _share_elements { 454 my ( $array1, $array2 ) = @_; 455 for my $element1 (@$array1) { 456 if ( &in_array( $array2, $element1 ) ) { return 1; } 457 } 458 return 0; 459} 460 461sub _fast_in_array { 462 my ( $array, $element ) = @_; 463 for (@$array) { 464 if ( $element eq $_ ) { 465 return 1; 466 } 467 } 468 return 0; 469} 470 471sub _in_array { 472 my ( $array, $test ) = @_; 473 my $match = 0; 474 for (@$array) { 475 $match++ if $_ eq $test; 476 } 477 return $match; 478} 479 480sub _is_same_array { 481 my ( $array, $test ) = @_; 482 return 1 if $array eq $test; 483 return 0 unless scalar @$array == scalar @$test; 484 485 my $astr = join '', sort @$array; 486 my $tstr = join '', sort @$test; 487 return 1 if $astr eq $tstr; 488 return 0; 489} 490 4911; 492