1package Sort::Fields; 2 3use strict; 4use vars qw($VERSION @EXPORT); 5 6use Exporter qw(import); 7require 5.003_03; 8; 9# Items to export into callers namespace by default. Note: do not export 10# names by default without a very good reason. Use EXPORT_OK instead. 11# Do not simply export all your public functions/methods/constants. 12@EXPORT = qw( 13 make_fieldsort 14 fieldsort 15 make_stable_fieldsort 16 stable_fieldsort 17); 18$VERSION = '1.001'; 19 20use Carp; 21 22sub make_fieldsort { 23 my $selfname; 24 if ((caller)[0] eq 'Sort::Fields') { 25 ($selfname) = (caller 1)[3] =~ /([^:]*)$/; 26 } else { 27 $selfname = 'make_fieldsort' 28 }; 29 unless (@_) { 30 croak "$selfname requires argument(s)"; 31 } 32 33 my ($sep, $cols); 34 if (ref $_[0]) { 35 $sep = '\\s+' 36 } else { 37 $sep = shift; 38 } 39 unless (ref($cols = shift) eq 'ARRAY') { 40 croak "$selfname field specifiers must be in anon array"; 41 } 42 my (@sortcode, @col); 43 my $level = 1; 44 my $maxcol = -1; 45 my $stable = 0; 46 if (@$cols and $$cols[0] eq '-') { 47 shift @$cols; 48 $stable = 1; 49 } 50 unless (@$cols) { 51 croak "$selfname must have at least one field specifier"; 52 } 53 for (@$cols) { 54 unless (/^-?\d+n?$/) { 55 croak "improperly formatted $selfname column specifier '$_'"; 56 } 57 my ($a, $b) = /^-/ ? qw(b a) : qw(a b); 58 my $op = /n$/ ? '<=>' : 'cmp'; 59 my ($col) = /^-?(\d+)/; 60 if ($col == 0) { # column 0 gives the entire string 61 push @sortcode, "\$${a}->[0] $op \$${b}->[0]"; 62 next; 63 } 64 push @col, (/(\d+)/)[0] - 1; 65 $maxcol = $col[-1] if $maxcol < $col[-1]; 66 if ($stable) { 67 # indices are offset by 1 in this case 68 my $levp1 = $level + 1; 69 push @sortcode, "\$${a}->[$levp1] $op \$${b}->[$levp1]"; 70 } else { 71 push @sortcode, "\$${a}->[$level] $op \$${b}->[$level]"; 72 } 73 $level++; 74 } 75 # have to check this all by itself, since if there's a regex 76 # error it won't show up until the sub is called (urk!) 77 eval '"" =~ /$sep/'; 78 if ($@) { 79 croak "probable regexp error in $selfname arg: /$sep/\n$@"; 80 } 81 my $splitfunc; 82 $splitfunc = eval 'sub { (split /$sep/o, $_, $maxcol + 2)[@col] } '; 83 if ($@) { 84 die "eval failed in $selfname (internal error?)\n$@"; 85 } 86 my $sortcode = join " or ", @sortcode; 87 my $sub; 88 if ($stable) { 89 my $i; # the $i for the stable sort closure 90 $sub = eval qq{ 91 sub { 92 if (\$^W and not wantarray) { 93 carp "fieldsort called in scalar or void context"; 94 } 95 \$i = 0; # reset counter in case reusing this closure 96 map \$_->[0], 97 sort { $sortcode or \$a->[1] <=> \$b->[1] } 98 map [\$_, \$i++, \$splitfunc->(\$_)], 99 \@_; 100 } 101 } 102 } else { 103 $sub = eval qq{ 104 sub { 105 if (\$^W and not wantarray) { 106 carp "fieldsort called in scalar or void context"; 107 } 108 map \$_->[0], 109 sort { $sortcode } 110 map [\$_, \$splitfunc->(\$_)], 111 \@_; 112 } 113 } 114 } 115 if ($@) { 116 die "eval failed in $selfname (internal error?)\n$@"; 117 } 118 $sub; 119} 120 121sub make_stable_fieldsort { 122 unless (@_) { 123 croak "make_stable_fieldsort requires argument(s)"; 124 } 125 if (ref $_[0] eq 'ARRAY') { 126 unshift @{$_[0]}, '-'; 127 } elsif (@_ > 1 and ref $_[1] eq 'ARRAY') { 128 unshift @{$_[1]}, '-'; 129 } 130 make_fieldsort @_; 131} 132 133sub fieldsort { 134 unless (@_) { 135 croak "fieldsort requires argument(s)"; 136 } 137 my ($sep, $cols); 138 if (ref $_[0]) { 139 $sep = '\\s+' 140 } else { 141 $sep = shift; 142 } 143 $cols = shift; 144 make_fieldsort($sep, $cols)->(@_); 145} 146 147sub stable_fieldsort { 148 unless (@_) { 149 croak "stable_fieldsort requires argument(s)"; 150 } 151 my ($sep, $cols); 152 if (ref $_[0] eq 'ARRAY') { 153 $sep = '\\s+'; 154 unshift @{$_[0]}, '-'; 155 } elsif (@_ > 1 and ref $_[1] eq 'ARRAY') { 156 $sep = shift; 157 unshift @{$_[1]}, '-'; 158 } 159 $cols = shift; 160 make_fieldsort($sep, $cols)->(@_); 161} 162 163 1641; 165__END__ 166 167=encoding utf8 168 169=head1 NAME 170 171Sort::Fields - Sort lines containing delimited fields 172 173=head1 SYNOPSIS 174 175 use Sort::Fields; 176 @sorted = fieldsort [3, '2n'], @lines; 177 @sorted = fieldsort '\+', [-1, -3, 0], @lines; 178 179 $sort_3_2n = make_fieldsort [3, '2n'], @lines; 180 @sorted = $sort_3_2n->(@lines); 181 182=head1 DESCRIPTION 183 184Sort::Fields provides a general purpose technique for efficiently sorting 185lists of lines that contain data separated into fields. 186 187Sort::Fields automatically imports two subroutines, C<fieldsort> and 188C<make_fieldsort>, and two variants, C<stable_fieldsort> and 189C<make_stable_fieldsort>. C<make_fieldsort> generates a sorting subroutine 190and returns a reference to it. C<fieldsort> is a wrapper for 191the C<make_fieldsort> subroutine. 192 193The first argument to make_fieldsort is a delimiter string, which is 194used as a regular expression argument for a C<split> operator. The 195delimiter string is optional. If it is not supplied, make_fieldsort 196splits each line using C</\s+/>. 197 198The second argument is an array reference containing one or more 199field specifiers. The specifiers indicate what fields in the strings 200will be used to sort the data. The specifier "1" indicates the first 201field, "2" indicates the second, and so on. A negative specifier 202like "-2" means to sort on the second field in reverse (descending) 203order. To indicate a numeric rather than alphabetic comparison, 204append "n" to the specifier. A specifier of "0" means the entire 205string ("-0" means the entire string, in reverse order). 206 207The order in which the specifiers appear is the order in which they 208will be used to sort the data. The primary key is first, the secondary 209key is second, and so on. 210 211C<fieldsort [1, 2], @data> is roughly equivalent to 212C<make_fieldsort([1, 2])-E<gt>(@data)>. Avoid calling fieldsort repeatedly 213with the same sort specifiers. If you need to use a particular 214sort more than once, it is more efficient to call C<make_fieldsort> 215once and reuse the subroutine it returns. 216 217C<stable_fieldsort> and C<make_stable_fieldsort> are like their 218"unstable" counterparts, except that the items that compare the same 219are maintained in their original order. 220 221=head1 EXAMPLES 222 223Some sample data (in array C<@data>): 224 225 123 asd 1.22 asdd 226 32 ewq 2.32 asdd 227 43 rewq 2.12 ewet 228 51 erwt 34.2 ewet 229 23 erww 4.21 ewet 230 91 fdgs 3.43 ewet 231 123 refs 3.22 asdd 232 123 refs 4.32 asdd 233 234 # alpha sort on column 1 235 print fieldsort [1], @data; 236 237 123 asd 1.22 asdd 238 123 refs 3.22 asdd 239 123 refs 4.32 asdd 240 23 erww 4.21 ewet 241 32 ewq 2.32 asdd 242 43 rewq 2.12 ewet 243 51 erwt 34.2 ewet 244 91 fdgs 3.43 ewet 245 246 # numeric sort on column 1 247 print fieldsort ['1n'], @data; 248 249 23 erww 4.21 ewet 250 32 ewq 2.32 asdd 251 43 rewq 2.12 ewet 252 51 erwt 34.2 ewet 253 91 fdgs 3.43 ewet 254 123 asd 1.22 asdd 255 123 refs 3.22 asdd 256 123 refs 4.32 asdd 257 258 # reverse numeric sort on column 1 259 print fieldsort ['-1n'], @data; 260 261 123 asd 1.22 asdd 262 123 refs 3.22 asdd 263 123 refs 4.32 asdd 264 91 fdgs 3.43 ewet 265 51 erwt 34.2 ewet 266 43 rewq 2.12 ewet 267 32 ewq 2.32 asdd 268 23 erww 4.21 ewet 269 270 # alpha sort on column 2, then alpha on entire line 271 print fieldsort [2, 0], @data; 272 273 123 asd 1.22 asdd 274 51 erwt 34.2 ewet 275 23 erww 4.21 ewet 276 32 ewq 2.32 asdd 277 91 fdgs 3.43 ewet 278 123 refs 3.22 asdd 279 123 refs 4.32 asdd 280 43 rewq 2.12 ewet 281 282 # alpha sort on column 4, then numeric on column 1, then reverse 283 # numeric on column 3 284 print fieldsort [4, '1n', '-3n'], @data; 285 286 32 ewq 2.32 asdd 287 123 refs 4.32 asdd 288 123 refs 3.22 asdd 289 123 asd 1.22 asdd 290 23 erww 4.21 ewet 291 43 rewq 2.12 ewet 292 51 erwt 34.2 ewet 293 91 fdgs 3.43 ewet 294 295 # now, splitting on either literal period or whitespace 296 # sort numeric on column 4 (fractional part of decimals) then 297 # numeric on column 3 (whole part of decimals) 298 print fieldsort '(?:\.|\s+)', ['4n', '3n'], @data; 299 300 51 erwt 34.2 ewet 301 43 rewq 2.12 ewet 302 23 erww 4.21 ewet 303 123 asd 1.22 asdd 304 123 refs 3.22 asdd 305 32 ewq 2.32 asdd 306 123 refs 4.32 asdd 307 91 fdgs 3.43 ewet 308 309 # alpha sort on column 4, then numeric on the entire line 310 # NOTE: produces warnings under -w 311 print fieldsort [4, '0n'], @data; 312 313 32 ewq 2.32 asdd 314 123 asd 1.22 asdd 315 123 refs 3.22 asdd 316 123 refs 4.32 asdd 317 23 erww 4.21 ewet 318 43 rewq 2.12 ewet 319 51 erwt 34.2 ewet 320 91 fdgs 3.43 ewet 321 322 # stable alpha sort on column 4 (maintains original relative order 323 # among items that compare the same) 324 print stable_fieldsort [4], @data; 325 326 123 asd 1.22 asdd 327 32 ewq 2.32 asdd 328 123 refs 3.22 asdd 329 123 refs 4.32 asdd 330 43 rewq 2.12 ewet 331 51 erwt 34.2 ewet 332 23 erww 4.21 ewet 333 91 fdgs 3.43 ewet 334 335=head1 BUGS 336 337Some rudimentary tests now. 338 339Perhaps something should be done to catch things like: 340 341 fieldsort '.', [1, 2], @lines; 342 343C<'.'> translates to C<split /./> -- probably not what you want. 344 345Passing blank lines and/or lines containing the wrong kind of 346data (alphas instead of numbers) can result in copious warning messages 347under C<-w>. 348 349If the regexp contains memory parentheses (C<(...)> rather than C<(?:...)>), 350split will function in "delimiter retention" mode, capturing the 351contents of the parentheses as well as the stuff between the delimiters. 352I could imagine how this could be useful, but on the other hand I 353could also imagine how it could be confusing if encountered unexpectedly. 354Caveat sortor. 355 356Not really a bug, but if you are planning to sort a large text file, 357consider using sort(1). Unless, of course, your operating system 358doesn't have sort(1). 359 360=head1 AUTHOR 361 362Joseph N. Hall, C<< <joseph@5sigma.com> >> 363 364=head1 SEE ALSO 365 366perl(1). 367 368=cut 369