1package Sort::Fields;
2
3use strict;
4use vars qw($VERSION @EXPORT);
5
6use Exporter qw(import);
7require 5.003_03;
8;
9# Items to export into callers namespace by default. Note: do not export
10# names by default without a very good reason. Use EXPORT_OK instead.
11# Do not simply export all your public functions/methods/constants.
12@EXPORT = qw(
13	make_fieldsort
14	fieldsort
15	make_stable_fieldsort
16	stable_fieldsort
17);
18$VERSION = '1.001';
19
20use Carp;
21
22sub make_fieldsort {
23	my $selfname;
24	if ((caller)[0] eq 'Sort::Fields') {
25		($selfname) = (caller 1)[3] =~ /([^:]*)$/;
26	} else {
27		$selfname = 'make_fieldsort'
28	};
29	unless (@_) {
30		croak "$selfname requires argument(s)";
31	}
32
33	my ($sep, $cols);
34	if (ref $_[0]) {
35		$sep = '\\s+'
36	} else {
37		$sep = shift;
38	}
39	unless (ref($cols = shift) eq 'ARRAY') {
40		croak "$selfname field specifiers must be in anon array";
41	}
42	my (@sortcode, @col);
43	my $level = 1;
44	my $maxcol = -1;
45	my $stable = 0;
46	if (@$cols and $$cols[0] eq '-') {
47		shift @$cols;
48		$stable = 1;
49	}
50	unless (@$cols) {
51		croak "$selfname must have at least one field specifier";
52	}
53	for (@$cols) {
54		unless (/^-?\d+n?$/) {
55			croak "improperly formatted $selfname column specifier '$_'";
56		}
57		my ($a, $b) = /^-/ ? qw(b a) : qw(a b);
58		my $op = /n$/ ? '<=>' : 'cmp';
59		my ($col) = /^-?(\d+)/;
60		if ($col == 0) {  # column 0 gives the entire string
61			push @sortcode, "\$${a}->[0] $op \$${b}->[0]";
62			next;
63		}
64		push @col, (/(\d+)/)[0] - 1;
65		$maxcol = $col[-1] if $maxcol < $col[-1];
66		if ($stable) {
67			# indices are offset by 1 in this case
68			my $levp1 = $level + 1;
69			push @sortcode, "\$${a}->[$levp1] $op \$${b}->[$levp1]";
70		} else {
71			push @sortcode, "\$${a}->[$level] $op \$${b}->[$level]";
72		}
73		$level++;
74	}
75	# have to check this all by itself, since if there's a regex
76    # error it won't show up until the sub is called (urk!)
77	eval '"" =~ /$sep/';
78	if ($@) {
79		croak "probable regexp error in $selfname arg: /$sep/\n$@";
80	}
81	my $splitfunc;
82	$splitfunc = eval 'sub { (split /$sep/o, $_, $maxcol + 2)[@col] } ';
83	if ($@) {
84		die "eval failed in $selfname (internal error?)\n$@";
85	}
86	my $sortcode = join " or ", @sortcode;
87	my $sub;
88	if ($stable) {
89		my $i;  # the $i for the stable sort closure
90		$sub = eval qq{
91			sub {
92				if (\$^W and not wantarray) {
93					carp "fieldsort called in scalar or void context";
94				}
95				\$i = 0;  # reset counter in case reusing this closure
96				map \$_->[0],
97					sort { $sortcode or \$a->[1] <=> \$b->[1] }
98					map [\$_, \$i++, \$splitfunc->(\$_)],
99					\@_;
100			}
101		}
102	} else {
103		$sub = eval qq{
104			sub {
105				if (\$^W and not wantarray) {
106					carp "fieldsort called in scalar or void context";
107				}
108				map \$_->[0],
109					sort { $sortcode }
110					map [\$_, \$splitfunc->(\$_)],
111					\@_;
112			}
113		}
114	}
115	if ($@) {
116		die "eval failed in $selfname (internal error?)\n$@";
117	}
118	$sub;
119}
120
121sub make_stable_fieldsort {
122	unless (@_) {
123		croak "make_stable_fieldsort requires argument(s)";
124	}
125	if (ref $_[0] eq 'ARRAY') {
126		unshift @{$_[0]}, '-';
127	} elsif (@_ > 1 and ref $_[1] eq 'ARRAY') {
128		unshift @{$_[1]}, '-';
129	}
130	make_fieldsort @_;
131}
132
133sub fieldsort {
134	unless (@_) {
135		croak "fieldsort requires argument(s)";
136	}
137	my ($sep, $cols);
138	if (ref $_[0]) {
139		$sep = '\\s+'
140	} else {
141		$sep = shift;
142	}
143	$cols = shift;
144	make_fieldsort($sep, $cols)->(@_);
145}
146
147sub stable_fieldsort {
148	unless (@_) {
149		croak "stable_fieldsort requires argument(s)";
150	}
151	my ($sep, $cols);
152	if (ref $_[0] eq 'ARRAY') {
153		$sep = '\\s+';
154		unshift @{$_[0]}, '-';
155	} elsif (@_ > 1 and ref $_[1] eq 'ARRAY') {
156		$sep = shift;
157		unshift @{$_[1]}, '-';
158	}
159	$cols = shift;
160	make_fieldsort($sep, $cols)->(@_);
161}
162
163
1641;
165__END__
166
167=encoding utf8
168
169=head1 NAME
170
171Sort::Fields - Sort lines containing delimited fields
172
173=head1 SYNOPSIS
174
175  use Sort::Fields;
176  @sorted = fieldsort [3, '2n'], @lines;
177  @sorted = fieldsort '\+', [-1, -3, 0], @lines;
178
179  $sort_3_2n = make_fieldsort [3, '2n'], @lines;
180  @sorted = $sort_3_2n->(@lines);
181
182=head1 DESCRIPTION
183
184Sort::Fields provides a general purpose technique for efficiently sorting
185lists of lines that contain data separated into fields.
186
187Sort::Fields automatically imports two subroutines, C<fieldsort> and
188C<make_fieldsort>, and two variants, C<stable_fieldsort> and
189C<make_stable_fieldsort>.  C<make_fieldsort> generates a sorting subroutine
190and returns a reference to it.  C<fieldsort> is a wrapper for
191the C<make_fieldsort> subroutine.
192
193The first argument to make_fieldsort is a delimiter string, which is
194used as a regular expression argument for a C<split> operator.  The
195delimiter string is optional.  If it is not supplied, make_fieldsort
196splits each line using C</\s+/>.
197
198The second argument is an array reference containing one or more
199field specifiers.  The specifiers indicate what fields in the strings
200will be used to sort the data.  The specifier "1" indicates the first
201field, "2" indicates the second, and so on.  A negative specifier
202like "-2" means to sort on the second field in reverse (descending)
203order.  To indicate a numeric rather than alphabetic comparison,
204append "n" to the specifier.  A specifier of "0" means the entire
205string ("-0" means the entire string, in reverse order).
206
207The order in which the specifiers appear is the order in which they
208will be used to sort the data.  The primary key is first, the secondary
209key is second, and so on.
210
211C<fieldsort [1, 2], @data> is roughly equivalent to
212C<make_fieldsort([1, 2])-E<gt>(@data)>.  Avoid calling fieldsort repeatedly
213with the same sort specifiers.  If you need to use a particular
214sort more than once, it is more efficient to call C<make_fieldsort>
215once and reuse the subroutine it returns.
216
217C<stable_fieldsort> and C<make_stable_fieldsort> are like their
218"unstable" counterparts, except that the items that compare the same
219are maintained in their original order.
220
221=head1 EXAMPLES
222
223Some sample data (in array C<@data>):
224
225  123   asd   1.22   asdd
226  32    ewq   2.32   asdd
227  43    rewq  2.12   ewet
228  51    erwt  34.2   ewet
229  23    erww  4.21   ewet
230  91    fdgs  3.43   ewet
231  123   refs  3.22   asdd
232  123   refs  4.32   asdd
233
234  # alpha sort on column 1
235  print fieldsort [1], @data;
236
237  123   asd   1.22   asdd
238  123   refs  3.22   asdd
239  123   refs  4.32   asdd
240  23    erww  4.21   ewet
241  32    ewq   2.32   asdd
242  43    rewq  2.12   ewet
243  51    erwt  34.2   ewet
244  91    fdgs  3.43   ewet
245
246  # numeric sort on column 1
247  print fieldsort ['1n'], @data;
248
249  23    erww  4.21   ewet
250  32    ewq   2.32   asdd
251  43    rewq  2.12   ewet
252  51    erwt  34.2   ewet
253  91    fdgs  3.43   ewet
254  123   asd   1.22   asdd
255  123   refs  3.22   asdd
256  123   refs  4.32   asdd
257
258  # reverse numeric sort on column 1
259  print fieldsort ['-1n'], @data;
260
261  123   asd   1.22   asdd
262  123   refs  3.22   asdd
263  123   refs  4.32   asdd
264  91    fdgs  3.43   ewet
265  51    erwt  34.2   ewet
266  43    rewq  2.12   ewet
267  32    ewq   2.32   asdd
268  23    erww  4.21   ewet
269
270  # alpha sort on column 2, then alpha on entire line
271  print fieldsort [2, 0], @data;
272
273  123   asd   1.22   asdd
274  51    erwt  34.2   ewet
275  23    erww  4.21   ewet
276  32    ewq   2.32   asdd
277  91    fdgs  3.43   ewet
278  123   refs  3.22   asdd
279  123   refs  4.32   asdd
280  43    rewq  2.12   ewet
281
282  # alpha sort on column 4, then numeric on column 1, then reverse
283  # numeric on column 3
284  print fieldsort [4, '1n', '-3n'], @data;
285
286  32    ewq   2.32   asdd
287  123   refs  4.32   asdd
288  123   refs  3.22   asdd
289  123   asd   1.22   asdd
290  23    erww  4.21   ewet
291  43    rewq  2.12   ewet
292  51    erwt  34.2   ewet
293  91    fdgs  3.43   ewet
294
295  # now, splitting on either literal period or whitespace
296  # sort numeric on column 4 (fractional part of decimals) then
297  # numeric on column 3 (whole part of decimals)
298  print fieldsort '(?:\.|\s+)', ['4n', '3n'], @data;
299
300  51    erwt  34.2   ewet
301  43    rewq  2.12   ewet
302  23    erww  4.21   ewet
303  123   asd   1.22   asdd
304  123   refs  3.22   asdd
305  32    ewq   2.32   asdd
306  123   refs  4.32   asdd
307  91    fdgs  3.43   ewet
308
309  # alpha sort on column 4, then numeric on the entire line
310  # NOTE: produces warnings under -w
311  print fieldsort [4, '0n'], @data;
312
313  32    ewq   2.32   asdd
314  123   asd   1.22   asdd
315  123   refs  3.22   asdd
316  123   refs  4.32   asdd
317  23    erww  4.21   ewet
318  43    rewq  2.12   ewet
319  51    erwt  34.2   ewet
320  91    fdgs  3.43   ewet
321
322  # stable alpha sort on column 4 (maintains original relative order
323  # among items that compare the same)
324  print stable_fieldsort [4], @data;
325
326  123   asd   1.22   asdd
327  32    ewq   2.32   asdd
328  123   refs  3.22   asdd
329  123   refs  4.32   asdd
330  43    rewq  2.12   ewet
331  51    erwt  34.2   ewet
332  23    erww  4.21   ewet
333  91    fdgs  3.43   ewet
334
335=head1 BUGS
336
337Some rudimentary tests now.
338
339Perhaps something should be done to catch things like:
340
341  fieldsort '.', [1, 2], @lines;
342
343C<'.'> translates to C<split /./> -- probably not what you want.
344
345Passing blank lines and/or lines containing the wrong kind of
346data (alphas instead of numbers) can result in copious warning messages
347under C<-w>.
348
349If the regexp contains memory parentheses (C<(...)> rather than C<(?:...)>),
350split will function in "delimiter retention" mode, capturing the
351contents of the parentheses as well as the stuff between the delimiters.
352I could imagine how this could be useful, but on the other hand I
353could also imagine how it could be confusing if encountered unexpectedly.
354Caveat sortor.
355
356Not really a bug, but if you are planning to sort a large text file,
357consider using sort(1).  Unless, of course, your operating system
358doesn't have sort(1).
359
360=head1 AUTHOR
361
362Joseph N. Hall, C<< <joseph@5sigma.com> >>
363
364=head1 SEE ALSO
365
366perl(1).
367
368=cut
369