# -*- indent-tabs-mode: nil; -*-
# vim:ft=perl:et:sw=4
# $Id$
package Sympa::WWW::Marc::Search;
use strict;
use warnings;
use Encode qw();
use English qw(-no_match_vars);
use File::Find qw();
use HTML::Entities qw();
use base qw(Sympa::WWW::Marc);
our $VERSION = "4.3+Sympa-6.2";
our ($AUTOLOAD, @MSGFILES);
##------------------------------------------------------------------------##
## Constructor
my %fields = (
age => 0,
archive_name => undef,
base_href => undef,
body => undef,
body_count => 0,
case => 0,
clean_words => undef,
date => undef,
date_count => 0,
directories => undef,
error => undef,
file_count => 0,
from => undef,
from_count => 0,
function1 => undef,
function2 => undef,
how => undef,
id => undef,
id_count => 0,
key_word => undef,
limit => 25,
match => 0,
previous => undef,
res => undef,
searched => 0,
search_base => undef,
subj => undef,
subj_count => 0,
words => undef,
);
sub new {
my $class = shift;
my $self = Sympa::WWW::Marc->new(\%fields);
bless $self, $class;
return $self;
}
##------------------------------------------------------------------------##
## These accessor methods keep a running count of matches in each area
## PUBLIC METHOD
sub body_count {
my $self = shift;
my $count = shift || 0;
return $self->{body_count} += $count;
}
sub id_count {
my $self = shift;
my $count = shift || 0;
return $self->{id_count} += $count;
}
sub date_count {
my $self = shift;
my $count = shift || 0;
return $self->{date_count} += $count;
}
sub from_count {
my $self = shift;
my $count = shift || 0;
return $self->{from_count} += $count;
}
sub subj_count {
my $self = shift;
my $count = shift || 0;
return $self->{subj_count} += $count;
}
sub key_word {
my $self = shift;
if (scalar @_) {
my $key_word = shift;
if (defined $key_word) {
$key_word = Encode::decode_utf8($key_word)
unless Encode::is_utf8($key_word);
$self->{'key_word'} = $key_word;
} else {
$self->{'key_word'} = undef;
}
}
return $self->{'key_word'};
}
##------------------------------------------------------------------------##
## Handle Actual Search
## PRIVATE METHOD
sub _find_match {
my ($self, $file, $subj, $from, $date, $id, $body_ref) = @_;
my $body_string = '';
my $match = 0;
my $res = undef;
# Check for a match in subject
if (($self->subj) && ($_ = $subj) && (&{$self->{function2}})) {
$subj =~ s,($self->{key_word}),\001$1\002,g; # Bold any matches
$self->subj_count(1); # Keeping count
$match = 1; # We'll be printing this one
}
# Check for a match in from
if (($self->from) && ($_ = $from) && (&{$self->{function2}})) {
$from =~ s,($self->{key_word}),\001$1\002,g;
$self->from_count(1);
$match = 1;
}
# Check for a match in date
if (($self->date) && ($_ = $date) && (&{$self->{function2}})) {
$date =~ s,($self->{key_word}),\001$1\002,g;
$self->date_count(1);
$match = 1;
}
# Check for a match in id
if (($self->id) && ($_ = $id) && (&{$self->{function2}})) {
$id =~ s,($self->{key_word}),\001$1\002,g;
$self->id_count(1);
$match = 1;
}
# Is this a full?
if (defined($body_ref)) {
my @body = @$body_ref;
# use routine generated by body_match_all
if (defined($self->function1)) {
my @words = @{$self->words};
my $i;
BODY: for $i (0 .. $#body) {
my %matches = ();
my $hit = '';
$_ = $body[$i];
my @linematches = &{$self->{function1}};
foreach $hit (@linematches) {
# key=searchterm; value=line
$matches{$hit} = $i;
}
# all keys = all terms?
if (keys %matches == @words) {
# Add to the running total
$self->body_count(1);
my $line;
$match = 1;
foreach $hit (
sort { $matches{$a} <=> $matches{$b} }
keys %matches
) {
# no duplicates please
next if ($matches{$hit} + 1 == $line);
# arrays start from 0
$line = $matches{$hit} + 1;
$body_string .= "line $line: $body[$matches{$hit}]";
}
$body_string =~ s,($self->{key_word}),\001$1\002,g;
last BODY;
}
}
}
# otherwise use routine supplied by match_any or match_this
else {
my $i;
BODY: for $i (0 .. $#body) {
if (($_ = $body[$i]) && (&{$self->{function2}})) {
$body_string =
($i == 0 ? '' : $body[$i - 1])
. $body[$i]
. ($i == $#body ? '' : $body[$i + 1]);
$body_string =~ s,($self->{key_word}),\001$1\002,g;
$self->body_count(1);
$match = 1;
last BODY;
}
}
}
}
if ($match == 1) {
$file =~ s,$self->{'search_base'},$self->{'base_href'},;
$res->{'file'} = $file;
$res->{'body_string'} = $body_string;
$res->{'id'} = $id;
$res->{'date'} = $date;
$res->{'from'} = $from;
$res->{'subj'} = $subj;
$res->{'rich'} = {};
foreach my $k (qw(body_string id date from subj)) {
my @rich = ();
foreach my $s (split /(\n|\001.*?\002)/, $res->{$k}) {
next unless length $s;
if ($s =~ /\n/) {
push @rich, {'text' => '', 'format' => 'br'};
} elsif ($s =~ /\001(.*)\002/) {
push @rich,
{'text' => Encode::encode_utf8($1), 'format' => 'b'};
} else {
push @rich,
{'text' => Encode::encode_utf8($s), 'format' => ''};
}
}
$res->{'rich'}->{$k} = \@rich;
$res->{$k} = HTML::Entities::encode_entities($res->{$k}, '<>&"');
$res->{$k} =~ s,\001,,g;
$res->{$k} =~ s,\002,,g;
$res->{$k} =~ s,\n,
,g;
$res->{$k} = Encode::encode_utf8($res->{$k});
}
push @{$self->{'res'}}, $res;
}
return $match; # 1 if match suceeds; 0 otherwise
}
##------------------------------------------------------------------------##
## Build up a list of files to search; read in the relevant portions;
## pass those parts off for checking (and printing if there's a match)
## by the _find_match method
## PUBLIC METHOD
sub search {
my $self = shift;
my $limit = $self->limit;
my $previous = $self->previous || 0;
my $directories = $self->directories;
my $body = $self->body || 0;
@MSGFILES = '';
my @directories = split /\0/, $directories;
foreach my $dir (@directories) {
my $directory = ($self->search_base . '/' . $dir . '/');
File::Find::find(
{ wanted => \&_get_file_list,
untaint => 1,
untaint_pattern => qr|^([-@\w./]+)$|
},
$directory
);
}
# File::Find returns these in somewhat haphazard order.
@MSGFILES = sort @MSGFILES;
# Newest files first!
@MSGFILES = reverse(@MSGFILES) if $self->age;
# The *real* number of files
$self->file_count($#MSGFILES);
@MSGFILES = splice(@MSGFILES, $previous) if $previous;
my $file;
my $i = 1; # Arrays are numbered from 0
# Avoid doing a lot of extra math inside the loop
$limit += $previous;
foreach $file (@MSGFILES) {
my ($subj, $from, $date, $id, $body_ref);
my $fh;
# Use encoding(utf8) input layer to perform Unicode case-insensitive
# match.
next unless open $fh, '<:encoding(utf8)', $file;
# Need this loop because newer versions of MHonArc put a version
# number on the first line of the message. Just in case Earl
# decides to change this again, we will loop until the subject
# comment tag is found. Thanks to Douglas Gray Stephens for
# pointing this out, and more importantly, for suggesting a good
# solution (though ultimately not the one in place here). That
# DGS was able to contribute to this modest little program is, I
# think, a good argument in favor of open source code!
while (<$fh>) {
## Next line is appended to the subject
if (defined $subj) {
$subj .= $1 if (/\s(.*)( -->|$)/);
if (/-->$/) {
$subj =~ s/ -->$//;
last;
}
} elsif (/^|$)/) {
## No more need to decode header fields
# $subj = &MIME::Words::decode_mimewords($1);
$subj = $1;
last if (/-->/);
}
}
# If $subj is undefined, <$fh> will be undefined thus going further
# is useless
next unless defined $subj;
$subj =~ s/ *-->$//;
($from = <$fh>) =~ s/^/$1/;
## No more need to decode header fields
#$from = &MIME::Words::decode_mimewords($from);
$from =~ tr/N-Z[@A-Mn-za-m/@A-Z[a-z/;
($date = <$fh>) =~ s/^/$1/;
($id = <$fh>) =~ s/^/$1/;
if ($body) {
my $lines = '';
while (<$fh>) {
# Messages are contained between Body-of-Message tags
next unless (/^/);
$_ = <$fh>;
while (!eof && ($_ !~ /^/)) {
$lines .= $_;
$_ = <$fh>;
}
last;
}
# Remove HTML comments
$lines =~ s///g;
# Translate newlines
$lines =~ s{
]*>(.*?)]*>} { my $s = $1; $s =~ s,\r\n|\r|\n,