1# -*- indent-tabs-mode: nil; -*-
2# vim:ft=perl:et:sw=4
3# $Id$
4
5package Sympa::WWW::Marc::Search;
6
7use strict;
8use warnings;
9use Encode qw();
10use English qw(-no_match_vars);
11use File::Find qw();
12use HTML::Entities qw();
13
14use base qw(Sympa::WWW::Marc);
15
16our $VERSION = "4.3+Sympa-6.2";
17our ($AUTOLOAD, @MSGFILES);
18
19##------------------------------------------------------------------------##
20## Constructor
21
22my %fields = (
23    age          => 0,
24    archive_name => undef,
25    base_href    => undef,
26    body         => undef,
27    body_count   => 0,
28    case         => 0,
29    clean_words  => undef,
30    date         => undef,
31    date_count   => 0,
32    directories  => undef,
33    error        => undef,
34    file_count   => 0,
35    from         => undef,
36    from_count   => 0,
37    function1    => undef,
38    function2    => undef,
39    how          => undef,
40    id           => undef,
41    id_count     => 0,
42    key_word     => undef,
43    limit        => 25,
44    match        => 0,
45    previous     => undef,
46    res          => undef,
47    searched     => 0,
48    search_base  => undef,
49    subj         => undef,
50    subj_count   => 0,
51    words        => undef,
52);
53
54sub new {
55    my $class = shift;
56    my $self  = Sympa::WWW::Marc->new(\%fields);
57    bless $self, $class;
58    return $self;
59}
60
61##------------------------------------------------------------------------##
62## These accessor methods keep a running count of matches in each area
63## PUBLIC METHOD
64
65sub body_count {
66    my $self = shift;
67    my $count = shift || 0;
68    return $self->{body_count} += $count;
69}
70
71sub id_count {
72    my $self = shift;
73    my $count = shift || 0;
74    return $self->{id_count} += $count;
75}
76
77sub date_count {
78    my $self = shift;
79    my $count = shift || 0;
80    return $self->{date_count} += $count;
81}
82
83sub from_count {
84    my $self = shift;
85    my $count = shift || 0;
86    return $self->{from_count} += $count;
87}
88
89sub subj_count {
90    my $self = shift;
91    my $count = shift || 0;
92    return $self->{subj_count} += $count;
93}
94
95sub key_word {
96    my $self = shift;
97
98    if (scalar @_) {
99        my $key_word = shift;
100        if (defined $key_word) {
101            $key_word = Encode::decode_utf8($key_word)
102                unless Encode::is_utf8($key_word);
103            $self->{'key_word'} = $key_word;
104        } else {
105            $self->{'key_word'} = undef;
106        }
107    }
108    return $self->{'key_word'};
109}
110
111##------------------------------------------------------------------------##
112## Handle Actual Search
113## PRIVATE METHOD
114
115sub _find_match {
116    my ($self, $file, $subj, $from, $date, $id, $body_ref) = @_;
117    my $body_string = '';
118    my $match       = 0;
119    my $res         = undef;
120
121    # Check for a match in subject
122    if (($self->subj) && ($_ = $subj) && (&{$self->{function2}})) {
123        $subj =~ s,($self->{key_word}),\001$1\002,g;    # Bold any matches
124        $self->subj_count(1);                           # Keeping count
125        $match = 1;    # We'll be printing this one
126    }
127    # Check for a match in from
128    if (($self->from) && ($_ = $from) && (&{$self->{function2}})) {
129        $from =~ s,($self->{key_word}),\001$1\002,g;
130        $self->from_count(1);
131        $match = 1;
132    }
133    # Check for a match in date
134    if (($self->date) && ($_ = $date) && (&{$self->{function2}})) {
135        $date =~ s,($self->{key_word}),\001$1\002,g;
136        $self->date_count(1);
137        $match = 1;
138    }
139    # Check for a match in id
140    if (($self->id) && ($_ = $id) && (&{$self->{function2}})) {
141        $id =~ s,($self->{key_word}),\001$1\002,g;
142        $self->id_count(1);
143        $match = 1;
144    }
145    # Is this a full?
146    if (defined($body_ref)) {
147        my @body = @$body_ref;
148        # use routine generated by body_match_all
149        if (defined($self->function1)) {
150            my @words = @{$self->words};
151            my $i;
152        BODY: for $i (0 .. $#body) {
153                my %matches = ();
154                my $hit     = '';
155                $_ = $body[$i];
156                my @linematches = &{$self->{function1}};
157                foreach $hit (@linematches) {
158                    # key=searchterm; value=line
159                    $matches{$hit} = $i;
160                }
161                # all keys = all terms?
162                if (keys %matches == @words) {
163                    # Add to the running total
164                    $self->body_count(1);
165                    my $line;
166                    $match = 1;
167                    foreach $hit (
168                        sort { $matches{$a} <=> $matches{$b} }
169                        keys %matches
170                    ) {
171                        # no duplicates please
172                        next if ($matches{$hit} + 1 == $line);
173                        # arrays start from 0
174                        $line = $matches{$hit} + 1;
175                        $body_string .= "line $line: $body[$matches{$hit}]";
176                    }
177                    $body_string =~ s,($self->{key_word}),\001$1\002,g;
178                    last BODY;
179                }
180            }
181        }
182        # otherwise use routine supplied by match_any or match_this
183        else {
184            my $i;
185        BODY: for $i (0 .. $#body) {
186                if (($_ = $body[$i]) && (&{$self->{function2}})) {
187                    $body_string =
188                          ($i == 0 ? '' : $body[$i - 1])
189                        . $body[$i]
190                        . ($i == $#body ? '' : $body[$i + 1]);
191                    $body_string =~ s,($self->{key_word}),\001$1\002,g;
192                    $self->body_count(1);
193                    $match = 1;
194                    last BODY;
195                }
196            }
197        }
198    }
199    if ($match == 1) {
200        $file =~ s,$self->{'search_base'},$self->{'base_href'},;
201        $res->{'file'}        = $file;
202        $res->{'body_string'} = $body_string;
203        $res->{'id'}          = $id;
204        $res->{'date'}        = $date;
205        $res->{'from'}        = $from;
206        $res->{'subj'}        = $subj;
207        $res->{'rich'}        = {};
208
209        foreach my $k (qw(body_string id date from subj)) {
210            my @rich = ();
211            foreach my $s (split /(\n|\001.*?\002)/, $res->{$k}) {
212                next unless length $s;
213                if ($s =~ /\n/) {
214                    push @rich, {'text' => '', 'format' => 'br'};
215                } elsif ($s =~ /\001(.*)\002/) {
216                    push @rich,
217                        {'text' => Encode::encode_utf8($1), 'format' => 'b'};
218                } else {
219                    push @rich,
220                        {'text' => Encode::encode_utf8($s), 'format' => ''};
221                }
222            }
223            $res->{'rich'}->{$k} = \@rich;
224            $res->{$k} = HTML::Entities::encode_entities($res->{$k}, '<>&"');
225            $res->{$k} =~ s,\001,<B>,g;
226            $res->{$k} =~ s,\002,</B>,g;
227            $res->{$k} =~ s,\n,<BR/>,g;
228            $res->{$k} = Encode::encode_utf8($res->{$k});
229        }
230        push @{$self->{'res'}}, $res;
231    }
232
233    return $match;    # 1 if match suceeds; 0 otherwise
234}
235
236##------------------------------------------------------------------------##
237## Build up a list of files to search; read in the relevant portions;
238## pass those parts off for checking (and printing if there's a match)
239## by the _find_match method
240## PUBLIC METHOD
241
242sub search {
243    my $self        = shift;
244    my $limit       = $self->limit;
245    my $previous    = $self->previous || 0;
246    my $directories = $self->directories;
247    my $body        = $self->body || 0;
248
249    @MSGFILES = '';
250
251    my @directories = split /\0/, $directories;
252    foreach my $dir (@directories) {
253        my $directory = ($self->search_base . '/' . $dir . '/');
254        File::Find::find(
255            {   wanted          => \&_get_file_list,
256                untaint         => 1,
257                untaint_pattern => qr|^([-@\w./]+)$|
258            },
259            $directory
260        );
261    }
262    # File::Find returns these in somewhat haphazard order.
263    @MSGFILES = sort @MSGFILES;
264
265    # Newest files first!
266    @MSGFILES = reverse(@MSGFILES) if $self->age;
267
268    # The *real* number of files
269    $self->file_count($#MSGFILES);
270
271    @MSGFILES = splice(@MSGFILES, $previous) if $previous;
272    my $file;
273    my $i = 1;    # Arrays are numbered from 0
274    # Avoid doing a lot of extra math inside the loop
275    $limit += $previous;
276    foreach $file (@MSGFILES) {
277        my ($subj, $from, $date, $id, $body_ref);
278        my $fh;
279
280        # Use encoding(utf8) input layer to perform Unicode case-insensitive
281        # match.
282        next unless open $fh, '<:encoding(utf8)', $file;
283
284        # Need this loop because newer versions of MHonArc put a version
285        # number on the first line of the message.  Just in case Earl
286        # decides to change this again, we will loop until the subject
287        # comment tag is found.  Thanks to Douglas Gray Stephens for
288        # pointing this out, and more importantly, for suggesting a good
289        # solution (though ultimately not the one in place here).  That
290        # DGS was able to contribute to this modest little program is, I
291        # think, a good argument in favor of open source code!
292        while (<$fh>) {
293            ## Next line is appended to the subject
294            if (defined $subj) {
295                $subj .= $1 if (/\s(.*)( -->|$)/);
296                if (/-->$/) {
297                    $subj =~ s/ -->$//;
298                    last;
299                }
300            } elsif (/^<!--X-Subject: (.*)( -->|$)/) {
301                ## No more need to decode header fields
302                # $subj = &MIME::Words::decode_mimewords($1);
303                $subj = $1;
304                last if (/-->/);
305            }
306        }
307
308        # If $subj is undefined, <$fh> will be undefined thus going further
309        # is useless
310        next unless defined $subj;
311
312        $subj =~ s/ *-->$//;
313
314        ($from = <$fh>) =~ s/^<!--X-From-R13: (.*) -->/$1/;
315
316        ## No more need to decode header fields
317        #$from = &MIME::Words::decode_mimewords($from);
318
319        $from =~ tr/N-Z[@A-Mn-za-m/@A-Z[a-z/;
320
321        ($date = <$fh>) =~ s/^<!--X-Date: (.*) -->/$1/;
322
323        ($id = <$fh>) =~ s/^<!--X-Message-Id: (.*) -->/$1/;
324
325        if ($body) {
326            my $lines = '';
327            while (<$fh>) {
328                # Messages are contained between Body-of-Message tags
329                next unless (/^<!--X-Body-of-Message-->/);
330                $_ = <$fh>;
331                while (!eof && ($_ !~ /^<!--X-MsgBody-End-->/)) {
332                    $lines .= $_;
333                    $_ = <$fh>;
334                }
335                last;
336            }
337            # Remove HTML comments
338            $lines =~ s/<!--[^<>]*?-->//g;
339            # Translate newlines
340            $lines =~ s{<PRE\b[^>]*>(.*?)</PRE\b[^>]*>}
341					   { my $s = $1; $s =~ s,\r\n|\r|\n,<BR/>,g; $s; }egis;
342            $lines =~ s/[\r\n]/ /g;
343            $lines =~ s/<(BR|DIV|P)\b[^>]*>[ \t]*/\n/gi;
344            # Remove other HTML tags
345            $lines =~ s,[ \t]*</[^>]*>,,g;
346            $lines =~ s/<[^>]*>[ \t]*//g;
347            $lines =~ s/[<>]/ /g;
348            # Decode entities
349            $lines = HTML::Entities::decode_entities($lines);
350            $lines =~ s/[\001\002]/ /g;
351            # Split lines
352            $body_ref = [split /(?<=\n)/, $lines];
353        }
354        close $fh;
355
356        # Decode entities
357        if ($subj) {
358            $subj = HTML::Entities::decode_entities($subj);
359            $subj =~ s/[\001\002\r\n]/ /g;
360        }
361        if ($from) {
362            $from = HTML::Entities::decode_entities($from);
363            $from =~ s/[\001\002\r\n]/ /g;
364        }
365        if ($date) {
366            $date = HTML::Entities::decode_entities($date);
367            $date =~ s/[\001\002\r\n]/ /g;
368        }
369        if ($id) {
370            $id = HTML::Entities::decode_entities($id);
371            $id =~ s/[\001\002\r\n]/ /g;
372        }
373
374        if ($self->_find_match($file, $subj, $from, $date, $id, $body_ref)) {
375            return ($i + $previous)
376                if ($self->body_count == $limit
377                or $self->subj_count == $limit
378                or $self->from_count == $limit
379                or $self->date_count == $limit
380                or $self->id_count == $limit);
381        }
382        $i++;
383    }
384
385    return $self->file_count + 1;
386}
387
388##------------------------------------------------------------------------##
389## Function for use with File::Find -- recursive
390## PRIVATE METHOD
391
392sub _get_file_list {
393    /^msg/ && push @MSGFILES, $File::Find::name;
394}
395
396##------------------------------------------------------------------------##
397## Eval anonymous pattern match functions based on user search terms
398
399## PUBLIC METHOD
400sub match_any {
401    my $self = shift;
402    my ($tail, $pat);
403    if   ($self->case) { $tail = '/i' }
404    else               { $tail = '/' }
405    my $code = <<EOCODE;
406sub {
407      use utf8;
408EOCODE
409    $code .= <<EOCODE if @_ > 5;
410      study;
411EOCODE
412    for $pat (@_) {
413        $code .= <<EOCODE;
414      return 1 if /$pat$tail;
415EOCODE
416    }
417    $code .= "}\n";
418    my $function = eval $code;
419    die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR;
420    return $function;
421}
422
423## PUBLIC METHOD
424sub body_match_all {
425    my ($self, @ret) = @_;
426    my ($len) = ($#ret + 1) / 2;
427    my (@pat) = splice(@ret, $len);
428    my $tail;
429    if   ($self->case) { $tail = '/i' }
430    else               { $tail = '/' }
431
432    # Quick fix: Escape non-words.
433    foreach my $ret (@ret) {
434        $ret =~ s/([^\x00-\x1F\s\w\x7F-\xFF])/\\$1/g;
435    }
436
437    my $code = <<EOCODE;
438sub {
439	use utf8;
440	my(\@matches);
441EOCODE
442    $code .= <<EOCODE if @pat > 5;
443	study;
444EOCODE
445    my $i;
446
447    for $i (0 .. $#pat) {
448        $code .= <<EOCODE;
449	push \@matches, '$ret[$i]' if /$pat[$i]$tail;
450EOCODE
451    }
452    $code .= <<EOCODE;
453	return \@matches;
454}
455EOCODE
456#	print "<PRE>$code</pre>";	# used for debugging
457    my $function = eval $code;
458    die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR;
459    return $function;
460}
461
462## PUBLIC METHOD
463sub match_all {
464    my $self = shift;
465    my ($sep, $tail);
466    if ($self->case) {
467        $sep  = "/i && /";
468        $tail = "/i }";
469    } else {
470        $sep  = "/ && /";
471        $tail = "/ }";
472    }
473    my $code = "sub { use utf8; /" . join("$sep", @_) . $tail;
474    my $function = eval $code;
475    die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR;
476    return $function;
477}
478
479## PUBLIC METHOD
480sub match_this {
481    my $self = shift;
482    my $string = join '\s+', @_;
483    $string = '(?i)' . $string if ($self->case);
484    my $code     = "sub { use utf8; /" . $string . "/ }";
485    my $function = eval $code;
486    die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR;
487    return $function;
488}
489
4901;
491__END__
492
493=encoding utf-8
494
495=head1 NAME
496
497Sympa::WWW::Marc::Search - Search archives of Sympa
498
499=head1 SYNOPSIS
500
501TBD.
502
503=head1 DESCRIPTION
504
505TBD.
506
507=head1 HISTORY
508
509L<Sympa::WWW::Marc::Search> was originally taken from
510L<Marc::Search> in MHonArc Search Engine by Eric D. Friedman:
511L<http://www.mhonarc.org/contrib/marc-search/>.
512
513=cut
514