1#################################################################
2# Functions.pm - internal functions for reading, parsing, arrays
3#################################################################
4# Original version thanks to Tom Hladish
5#
6# $Id: Functions.pm,v 1.16 2012/02/07 21:49:27 astoltzfus Exp $
7
8#################### START POD DOCUMENTATION ##################
9
10=head1 NAME
11
12Bio::NEXUS::Functions - Provides private utiliy functions for the module
13
14=head1 SYNOPSIS
15
16=head1 DESCRIPTION
17
18This package provides private functions that are not object-specific.
19
20=head1 COMMENTS
21
22=head1 FEEDBACK
23
24All feedback (bugs, feature enhancements, etc.) is greatly appreciated.
25
26=head1 AUTHORS
27
28 Original version by Thomas Hladish (tjhladish at yahoo)
29
30=head1 VERSION
31
32$Revision: 1.16 $
33
34=head1 METHODS
35
36=cut
37
38package Bio::NEXUS::Functions;
39
40use strict;
41#use Data::Dumper; # XXX this is not used, might as well not import it!
42#use Carp; # XXX this is not used, might as well not import it!
43use Bio::NEXUS::Util::Exceptions;
44use vars qw(@EXPORT @EXPORT_OK @ISA $VERSION);
45use Bio::NEXUS; $VERSION = $Bio::NEXUS::VERSION;
46use Exporter ();
47
48@ISA    = qw ( Exporter );
49@EXPORT = qw(
50    &_slurp
51    &_parse_nexus_words
52    &_ntsa
53    &_stna
54    &_quote_if_needed
55    &_nexus_formatted
56    &_is_comment
57    &_is_number
58    &_is_dec_number
59    &_sci_to_dec
60    &_unique
61    &_nonunique
62    &_share_elements
63    &_fast_in_array
64    &_in_array
65    &_is_same_array
66);
67
68## READING & PARSING FUNCTIONS:
69
70=begin comment
71
72 Name    : _slurp
73 Usage   : $file_content = _slurp($filename);
74 Function: reads an entire file into memory
75 Returns : none
76 Args    : file name (string)
77
78=end comment
79
80=cut
81
82sub _slurp {
83    my ($filename) = @_;
84    open my $fh, '<', "$filename"
85        || Bio::NEXUS::Util::Exceptions::FileError->throw(
86    	'error' => "ERROR: Could not open filename <$filename> for input; $!"
87    );
88    my $file_content = do { local ($/); <$fh> };
89    return $file_content;
90}
91
92=begin comment
93
94 Title   : _parse_nexus_words
95 Usage   : $parsed_words = _parse_nexus_words($buffer);
96 Function: parse a string of text into "words" (as defined in the NEXUS standard)
97 Returns : an array ref of "words" and punctuation marks.  Single-quoted expressions are single "words".  Double quotes are not supported.
98 Args    : text buffer
99 Notes   : this method has replaced _parse_string_tokens(), which did not conform to the NEXUS standard in all its quirky splendor (particularly with regard to punctuation)
100
101=end comment
102
103=cut
104
105sub _parse_nexus_words {
106    my $buffer = shift;
107    if ( not defined $buffer ) {
108	    Bio::NEXUS::Util::Exceptions::BadArgs->throw(
109    		'error' => '_parse_nexus_words() requires a text string argument (the text to be parsed)'
110    	);
111    }
112    my @words;
113    my ( $word, $in_quotes ) = ( q{}, 0 );
114
115    my @chars         = split( //, $buffer );
116    my $comment_level = 0;
117
118    # iterate through the characters
119    for ( my $i = 0; $i < @chars; $i++ ) {
120        my $char = $chars[$i];
121        my $next = $chars[ $i + 1 ];
122
123        if ($comment_level) {  # if we are in a comment already
124            $comment_level++ if ( $char eq '[' );
125            $comment_level-- if ( $char eq ']' );
126            $word .= $char;
127        }
128
129        # If we see a quote
130        elsif ( $char eq q{'} ) {
131
132            # and we're already inside quotes . . .
133            if ($in_quotes) {
134
135                # check to see if this is an escaped (doubled single) quote,
136                # (unless we're already at the end of the string to be parsed).
137                if ( defined $next && $next eq q{'} ) {
138
139                    # If it is, append it to the current word;
140                    $word .= $char;
141                }
142                else {
143
144                    # otherwise, close off the quoted string
145                    $in_quotes--;
146
147                    # Replace spaces with underscores (according to NEXUS, they're equivalent)
148                    #
149                    # This may not be correct.  Certainly TreeBASE doesn't like it
150                    # when we use both quoted strings and underscores in them
151                    $word =~ s/ /_/g;
152
153                    # Push it onto the word list, after
154                    # dealing with funny apostrophe business
155                    push @words, _ntsa($word);
156
157                    # And clean the slate
158                    $word = q{};
159                }
160            }
161            else {
162
163                # If we weren't in quotes before, we are now
164                $in_quotes++;
165            }
166        }
167        elsif ($in_quotes) {
168
169            # We're in a quoted string, so anything can be part of the word
170            $word .= $char;
171        }
172        elsif ( $char eq '[' ) {  # hit new comment, level 0 (bug if we just finished one)
173            $comment_level++;
174            $word .= $char;
175        }
176
177        # If we see NEXUS-style punctuation
178        elsif ( $char =~ /[\[\]\-(){}\/\\,;:=*"`+<>]/ ) {
179
180            push @words, &_ntsa($word)
181
182                # $word will be q{} if there was a preceding space;
183                # otherwise, it will contain some string
184                unless $word eq q{};
185
186			# then that counts as a word (we'll deal with pos/neg
187			# numbers later in _rebuild_numbers() if that gets called)
188            push @words, $char;
189            $word = q{};
190        }
191
192        # If we see whitespace
193        elsif ( $char =~ /\s/ ) {
194
195            # then we just finished a [probably] normal, space-delimited word
196            push @words, &_ntsa($word)
197
198                unless $word eq q{};
199
200            # although we don't want to keep pushing it
201            # if there are multiple spaces, so we empty $word
202            $word = q{};
203        }
204
205        # If $word isn't quoted, and $char is neither punctuation nor whitespace
206        else {
207            $word .= $char;
208        }
209    }
210
211    push @words, $word unless $word eq q{};
212    return \@words;
213}
214
215sub _rebuild_numbers {
216    my $words = shift;
217    my @new_words;
218
219    # Don't bother checking whether the last word is a '+' or '-'
220    for ( my $i = 0; $i < ( @$words - 1 ); $i++ ) {
221        my $word = $words->[$i];
222        my $next = $words->[ $i + 1 ];    # There will always be a next
223
224        #        my $next_next = defined $words[$i +2] ? $words[$i+2] : q{};
225        # There might be a previous
226        my $last = $i == 0 ? undef: $words->[ $i - 1 ];
227
228        if ( $word eq '-' || $word eq '+' ) {
229            if ( my ( $num, $exp ) = $next =~ /^([\d.]+)(e)?/i ) {
230                if ( _is_dec_number($num) ) {
231                    $word .= $next;
232                    $i++;
233                    if ($exp) {
234
235                    }
236                }
237            }
238        }
239        else {
240            push @new_words, $word;
241        }
242    }
243    return \@new_words;
244}
245
246=begin comment
247
248 Title   : _ntsa (nexus to standard apostrophe)
249 Usage   : $standard_word = $block->_ntsa($nexus_word);
250 Function: change doubled single quotes to single single quotes (apostrophes)
251 Returns : a standard english word (or phrase)
252 Args    : a nexus "word"
253 Notes   : See NEXUS definition of "word" for an explanation
254
255=end comment
256
257=cut
258
259sub _ntsa {
260    my $nexus_word = shift;
261    $nexus_word =~ s/[^']''[^']/'/g;
262    return $nexus_word;
263}
264
265=begin comment
266
267 Title   : _stna (standard to nexus apostrophe)
268 Usage   : $nexus_word = $block->_stna($standard_word);
269 Function: change single single quotes (apostrophes) to double single quotes
270 Returns : a nexus "word"
271 Args    : a standard english word (or phrase)
272 Notes   : See NEXUS definition of "word" for an explanation
273
274=end comment
275
276=cut
277
278sub _stna {
279    my $standard_word = shift;
280    $standard_word =~ s/[^']'[^']/''/g;
281    return $standard_word;
282}
283
284=begin comment
285
286 Title   : _quote_if_needed
287 Usage   : $string = Bio::NEXUS::Block::_quote_if_needed($string);
288 Function: put single quotes around string if it contains spaces or NEXUS punctuation
289 Returns : a string, in single quotes if necessary
290 Args    : a string
291
292=end comment
293
294=cut
295
296sub _quote_if_needed {
297    my $nexus_word = shift;
298    if ( $nexus_word =~ /[-\s(){}\[\]\/\\,;:=+*<>`'"]/ ) {
299        return "'$nexus_word'";
300    }
301    else {
302        return $nexus_word;
303    }
304}
305
306=begin comment
307
308 Title   : _nexus_formatted
309 Usage   : $string = Bio::NEXUS::Block::_nexus_formatted($string);
310 Function: escape apostrophes and quote strings as needed for NEXUS output
311 Returns : a string
312 Args    : a string
313
314=end comment
315
316=cut
317
318sub _nexus_formatted {
319    my $nexus_word = shift;
320    $nexus_word = _quote_if_needed( _stna($nexus_word) );
321    return $nexus_word;
322}
323
324=begin comment
325
326 Name    : _is_comment
327 Usage   : $boolean = _is_comment($string);
328 Function: tests whether something looks like a comment
329 Returns : boolean
330 Args    : string to test
331
332=end comment
333
334=cut
335
336sub _is_comment {
337    my ($string) = @_;
338    if ( $string =~ /^\[.*\]$/s ) { return 1 }
339    else { return 0 }
340}
341
342=begin comment
343
344 Title   : _is_dec_number
345 Usage   : if ( _is_dec_number($num) ) { do_something() };
346 Function: verifies that a number is a normal decimal number (e.g. 3 or 9.41)
347 Returns : 1 if $num is a number, otherwise 0
348 Args    : a number
349
350=end comment
351
352=cut
353
354sub _is_dec_number {
355    my ($number) = @_;
356
357    return 0 unless defined $number && length $number;
358
359    my $number_regex = qr/^[-+]?                 # positive or negative
360                                (?: \d+          # e.g., 523
361                                 | \d*[.]\d+     # 3.14 or .45
362                                 | \d+[.]\d*     # 212. or 212.0
363                                )
364                        $/x;
365
366    return 0 unless defined $number && $number =~ $number_regex;
367
368    return 1;
369}
370
371=begin comment
372
373 Title   : _is_number
374 Usage   : if ( _is_number($num) ) { do_something() };
375 Function: verifies that a number is of reasonable form (such as 0.4 or 6.1e2.1)
376 Returns : 1 if $num is a number, otherwise 0
377 Args    : a number
378
379=end comment
380
381=cut
382
383sub _is_number {
384    my ($number) = @_;
385
386    return 0 unless defined $number && length $number;
387
388    my ( $num, $exp ) = $number =~ /^([^e]+)(?:e([^e]+))?$/i;
389
390    return 0 unless _is_dec_number($num);
391
392    return _is_dec_number($exp) if defined $exp;
393
394    return 1;
395}
396
397=begin comment
398
399 Title   : _sci_to_dec
400 Usage   : $decimal = _sci_to_dec($scientic_notation);
401 Function: Changes scientific notation to decimal notation
402 Returns : scalar (a number)
403 Args    : scalar (a number), possibly in scientific notation
404
405=end comment
406
407=cut
408
409sub _sci_to_dec {
410    my ($sci_num) = @_;
411
412    $sci_num =~ s/\s//g;
413    return $sci_num if _is_dec_number($sci_num);
414
415    my ( $num, $exp ) = $sci_num =~ /^ ([^e]+) e ([^e]+) $/ix;
416
417    return 0 unless ( _is_dec_number($num) && _is_dec_number($exp) );
418
419    my $dec_num = $num * ( 10**$exp );
420    return $dec_num;
421}
422
423## ARRAY FUNCTIONS:
424
425=begin comment
426
427 Name    : _any
428 Usage   :  _any($filename);
429 Function: reads an entire file into memory
430 Returns : none
431 Args    : file name (string)
432
433=end comment
434
435=cut
436
437sub _unique {
438    my (@array) = @_;
439    my %seen = ();
440
441    # from perl cookbook.  fast, and preserves order
442    my @unique = grep { !$seen{$_}++ } @array;
443    return @unique;
444}
445
446sub _nonunique {
447    my (@array) = @_;
448    my %seen = ();
449    my @nonunique = grep { $seen{$_}++ } @array;
450    return @nonunique;
451}
452
453sub _share_elements {
454    my ( $array1, $array2 ) = @_;
455    for my $element1 (@$array1) {
456        if ( &in_array( $array2, $element1 ) ) { return 1; }
457    }
458    return 0;
459}
460
461sub _fast_in_array {
462    my ( $array, $element ) = @_;
463    for (@$array) {
464        if ( $element eq $_ ) {
465            return 1;
466        }
467    }
468    return 0;
469}
470
471sub _in_array {
472    my ( $array, $test ) = @_;
473    my $match = 0;
474    for (@$array) {
475        $match++ if $_ eq $test;
476    }
477    return $match;
478}
479
480sub _is_same_array {
481    my ( $array, $test ) = @_;
482    return 1 if $array eq $test;
483    return 0 unless scalar @$array == scalar @$test;
484
485    my $astr = join '', sort @$array;
486    my $tstr = join '', sort @$test;
487    return 1 if $astr eq $tstr;
488    return 0;
489}
490
4911;
492