1# -*- indent-tabs-mode: nil; -*- 2# vim:ft=perl:et:sw=4 3# $Id$ 4 5package Sympa::WWW::Marc::Search; 6 7use strict; 8use warnings; 9use Encode qw(); 10use English qw(-no_match_vars); 11use File::Find qw(); 12use HTML::Entities qw(); 13 14use base qw(Sympa::WWW::Marc); 15 16our $VERSION = "4.3+Sympa-6.2"; 17our ($AUTOLOAD, @MSGFILES); 18 19##------------------------------------------------------------------------## 20## Constructor 21 22my %fields = ( 23 age => 0, 24 archive_name => undef, 25 base_href => undef, 26 body => undef, 27 body_count => 0, 28 case => 0, 29 clean_words => undef, 30 date => undef, 31 date_count => 0, 32 directories => undef, 33 error => undef, 34 file_count => 0, 35 from => undef, 36 from_count => 0, 37 function1 => undef, 38 function2 => undef, 39 how => undef, 40 id => undef, 41 id_count => 0, 42 key_word => undef, 43 limit => 25, 44 match => 0, 45 previous => undef, 46 res => undef, 47 searched => 0, 48 search_base => undef, 49 subj => undef, 50 subj_count => 0, 51 words => undef, 52); 53 54sub new { 55 my $class = shift; 56 my $self = Sympa::WWW::Marc->new(\%fields); 57 bless $self, $class; 58 return $self; 59} 60 61##------------------------------------------------------------------------## 62## These accessor methods keep a running count of matches in each area 63## PUBLIC METHOD 64 65sub body_count { 66 my $self = shift; 67 my $count = shift || 0; 68 return $self->{body_count} += $count; 69} 70 71sub id_count { 72 my $self = shift; 73 my $count = shift || 0; 74 return $self->{id_count} += $count; 75} 76 77sub date_count { 78 my $self = shift; 79 my $count = shift || 0; 80 return $self->{date_count} += $count; 81} 82 83sub from_count { 84 my $self = shift; 85 my $count = shift || 0; 86 return $self->{from_count} += $count; 87} 88 89sub subj_count { 90 my $self = shift; 91 my $count = shift || 0; 92 return $self->{subj_count} += $count; 93} 94 95sub key_word { 96 my $self = shift; 97 98 if (scalar @_) { 99 my $key_word = shift; 100 if (defined $key_word) { 101 $key_word = Encode::decode_utf8($key_word) 102 unless Encode::is_utf8($key_word); 103 $self->{'key_word'} = $key_word; 104 } else { 105 $self->{'key_word'} = undef; 106 } 107 } 108 return $self->{'key_word'}; 109} 110 111##------------------------------------------------------------------------## 112## Handle Actual Search 113## PRIVATE METHOD 114 115sub _find_match { 116 my ($self, $file, $subj, $from, $date, $id, $body_ref) = @_; 117 my $body_string = ''; 118 my $match = 0; 119 my $res = undef; 120 121 # Check for a match in subject 122 if (($self->subj) && ($_ = $subj) && (&{$self->{function2}})) { 123 $subj =~ s,($self->{key_word}),\001$1\002,g; # Bold any matches 124 $self->subj_count(1); # Keeping count 125 $match = 1; # We'll be printing this one 126 } 127 # Check for a match in from 128 if (($self->from) && ($_ = $from) && (&{$self->{function2}})) { 129 $from =~ s,($self->{key_word}),\001$1\002,g; 130 $self->from_count(1); 131 $match = 1; 132 } 133 # Check for a match in date 134 if (($self->date) && ($_ = $date) && (&{$self->{function2}})) { 135 $date =~ s,($self->{key_word}),\001$1\002,g; 136 $self->date_count(1); 137 $match = 1; 138 } 139 # Check for a match in id 140 if (($self->id) && ($_ = $id) && (&{$self->{function2}})) { 141 $id =~ s,($self->{key_word}),\001$1\002,g; 142 $self->id_count(1); 143 $match = 1; 144 } 145 # Is this a full? 146 if (defined($body_ref)) { 147 my @body = @$body_ref; 148 # use routine generated by body_match_all 149 if (defined($self->function1)) { 150 my @words = @{$self->words}; 151 my $i; 152 BODY: for $i (0 .. $#body) { 153 my %matches = (); 154 my $hit = ''; 155 $_ = $body[$i]; 156 my @linematches = &{$self->{function1}}; 157 foreach $hit (@linematches) { 158 # key=searchterm; value=line 159 $matches{$hit} = $i; 160 } 161 # all keys = all terms? 162 if (keys %matches == @words) { 163 # Add to the running total 164 $self->body_count(1); 165 my $line; 166 $match = 1; 167 foreach $hit ( 168 sort { $matches{$a} <=> $matches{$b} } 169 keys %matches 170 ) { 171 # no duplicates please 172 next if ($matches{$hit} + 1 == $line); 173 # arrays start from 0 174 $line = $matches{$hit} + 1; 175 $body_string .= "line $line: $body[$matches{$hit}]"; 176 } 177 $body_string =~ s,($self->{key_word}),\001$1\002,g; 178 last BODY; 179 } 180 } 181 } 182 # otherwise use routine supplied by match_any or match_this 183 else { 184 my $i; 185 BODY: for $i (0 .. $#body) { 186 if (($_ = $body[$i]) && (&{$self->{function2}})) { 187 $body_string = 188 ($i == 0 ? '' : $body[$i - 1]) 189 . $body[$i] 190 . ($i == $#body ? '' : $body[$i + 1]); 191 $body_string =~ s,($self->{key_word}),\001$1\002,g; 192 $self->body_count(1); 193 $match = 1; 194 last BODY; 195 } 196 } 197 } 198 } 199 if ($match == 1) { 200 $file =~ s,$self->{'search_base'},$self->{'base_href'},; 201 $res->{'file'} = $file; 202 $res->{'body_string'} = $body_string; 203 $res->{'id'} = $id; 204 $res->{'date'} = $date; 205 $res->{'from'} = $from; 206 $res->{'subj'} = $subj; 207 $res->{'rich'} = {}; 208 209 foreach my $k (qw(body_string id date from subj)) { 210 my @rich = (); 211 foreach my $s (split /(\n|\001.*?\002)/, $res->{$k}) { 212 next unless length $s; 213 if ($s =~ /\n/) { 214 push @rich, {'text' => '', 'format' => 'br'}; 215 } elsif ($s =~ /\001(.*)\002/) { 216 push @rich, 217 {'text' => Encode::encode_utf8($1), 'format' => 'b'}; 218 } else { 219 push @rich, 220 {'text' => Encode::encode_utf8($s), 'format' => ''}; 221 } 222 } 223 $res->{'rich'}->{$k} = \@rich; 224 $res->{$k} = HTML::Entities::encode_entities($res->{$k}, '<>&"'); 225 $res->{$k} =~ s,\001,<B>,g; 226 $res->{$k} =~ s,\002,</B>,g; 227 $res->{$k} =~ s,\n,<BR/>,g; 228 $res->{$k} = Encode::encode_utf8($res->{$k}); 229 } 230 push @{$self->{'res'}}, $res; 231 } 232 233 return $match; # 1 if match suceeds; 0 otherwise 234} 235 236##------------------------------------------------------------------------## 237## Build up a list of files to search; read in the relevant portions; 238## pass those parts off for checking (and printing if there's a match) 239## by the _find_match method 240## PUBLIC METHOD 241 242sub search { 243 my $self = shift; 244 my $limit = $self->limit; 245 my $previous = $self->previous || 0; 246 my $directories = $self->directories; 247 my $body = $self->body || 0; 248 249 @MSGFILES = ''; 250 251 my @directories = split /\0/, $directories; 252 foreach my $dir (@directories) { 253 my $directory = ($self->search_base . '/' . $dir . '/'); 254 File::Find::find( 255 { wanted => \&_get_file_list, 256 untaint => 1, 257 untaint_pattern => qr|^([-@\w./]+)$| 258 }, 259 $directory 260 ); 261 } 262 # File::Find returns these in somewhat haphazard order. 263 @MSGFILES = sort @MSGFILES; 264 265 # Newest files first! 266 @MSGFILES = reverse(@MSGFILES) if $self->age; 267 268 # The *real* number of files 269 $self->file_count($#MSGFILES); 270 271 @MSGFILES = splice(@MSGFILES, $previous) if $previous; 272 my $file; 273 my $i = 1; # Arrays are numbered from 0 274 # Avoid doing a lot of extra math inside the loop 275 $limit += $previous; 276 foreach $file (@MSGFILES) { 277 my ($subj, $from, $date, $id, $body_ref); 278 my $fh; 279 280 # Use encoding(utf8) input layer to perform Unicode case-insensitive 281 # match. 282 next unless open $fh, '<:encoding(utf8)', $file; 283 284 # Need this loop because newer versions of MHonArc put a version 285 # number on the first line of the message. Just in case Earl 286 # decides to change this again, we will loop until the subject 287 # comment tag is found. Thanks to Douglas Gray Stephens for 288 # pointing this out, and more importantly, for suggesting a good 289 # solution (though ultimately not the one in place here). That 290 # DGS was able to contribute to this modest little program is, I 291 # think, a good argument in favor of open source code! 292 while (<$fh>) { 293 ## Next line is appended to the subject 294 if (defined $subj) { 295 $subj .= $1 if (/\s(.*)( -->|$)/); 296 if (/-->$/) { 297 $subj =~ s/ -->$//; 298 last; 299 } 300 } elsif (/^<!--X-Subject: (.*)( -->|$)/) { 301 ## No more need to decode header fields 302 # $subj = &MIME::Words::decode_mimewords($1); 303 $subj = $1; 304 last if (/-->/); 305 } 306 } 307 308 # If $subj is undefined, <$fh> will be undefined thus going further 309 # is useless 310 next unless defined $subj; 311 312 $subj =~ s/ *-->$//; 313 314 ($from = <$fh>) =~ s/^<!--X-From-R13: (.*) -->/$1/; 315 316 ## No more need to decode header fields 317 #$from = &MIME::Words::decode_mimewords($from); 318 319 $from =~ tr/N-Z[@A-Mn-za-m/@A-Z[a-z/; 320 321 ($date = <$fh>) =~ s/^<!--X-Date: (.*) -->/$1/; 322 323 ($id = <$fh>) =~ s/^<!--X-Message-Id: (.*) -->/$1/; 324 325 if ($body) { 326 my $lines = ''; 327 while (<$fh>) { 328 # Messages are contained between Body-of-Message tags 329 next unless (/^<!--X-Body-of-Message-->/); 330 $_ = <$fh>; 331 while (!eof && ($_ !~ /^<!--X-MsgBody-End-->/)) { 332 $lines .= $_; 333 $_ = <$fh>; 334 } 335 last; 336 } 337 # Remove HTML comments 338 $lines =~ s/<!--[^<>]*?-->//g; 339 # Translate newlines 340 $lines =~ s{<PRE\b[^>]*>(.*?)</PRE\b[^>]*>} 341 { my $s = $1; $s =~ s,\r\n|\r|\n,<BR/>,g; $s; }egis; 342 $lines =~ s/[\r\n]/ /g; 343 $lines =~ s/<(BR|DIV|P)\b[^>]*>[ \t]*/\n/gi; 344 # Remove other HTML tags 345 $lines =~ s,[ \t]*</[^>]*>,,g; 346 $lines =~ s/<[^>]*>[ \t]*//g; 347 $lines =~ s/[<>]/ /g; 348 # Decode entities 349 $lines = HTML::Entities::decode_entities($lines); 350 $lines =~ s/[\001\002]/ /g; 351 # Split lines 352 $body_ref = [split /(?<=\n)/, $lines]; 353 } 354 close $fh; 355 356 # Decode entities 357 if ($subj) { 358 $subj = HTML::Entities::decode_entities($subj); 359 $subj =~ s/[\001\002\r\n]/ /g; 360 } 361 if ($from) { 362 $from = HTML::Entities::decode_entities($from); 363 $from =~ s/[\001\002\r\n]/ /g; 364 } 365 if ($date) { 366 $date = HTML::Entities::decode_entities($date); 367 $date =~ s/[\001\002\r\n]/ /g; 368 } 369 if ($id) { 370 $id = HTML::Entities::decode_entities($id); 371 $id =~ s/[\001\002\r\n]/ /g; 372 } 373 374 if ($self->_find_match($file, $subj, $from, $date, $id, $body_ref)) { 375 return ($i + $previous) 376 if ($self->body_count == $limit 377 or $self->subj_count == $limit 378 or $self->from_count == $limit 379 or $self->date_count == $limit 380 or $self->id_count == $limit); 381 } 382 $i++; 383 } 384 385 return $self->file_count + 1; 386} 387 388##------------------------------------------------------------------------## 389## Function for use with File::Find -- recursive 390## PRIVATE METHOD 391 392sub _get_file_list { 393 /^msg/ && push @MSGFILES, $File::Find::name; 394} 395 396##------------------------------------------------------------------------## 397## Eval anonymous pattern match functions based on user search terms 398 399## PUBLIC METHOD 400sub match_any { 401 my $self = shift; 402 my ($tail, $pat); 403 if ($self->case) { $tail = '/i' } 404 else { $tail = '/' } 405 my $code = <<EOCODE; 406sub { 407 use utf8; 408EOCODE 409 $code .= <<EOCODE if @_ > 5; 410 study; 411EOCODE 412 for $pat (@_) { 413 $code .= <<EOCODE; 414 return 1 if /$pat$tail; 415EOCODE 416 } 417 $code .= "}\n"; 418 my $function = eval $code; 419 die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR; 420 return $function; 421} 422 423## PUBLIC METHOD 424sub body_match_all { 425 my ($self, @ret) = @_; 426 my ($len) = ($#ret + 1) / 2; 427 my (@pat) = splice(@ret, $len); 428 my $tail; 429 if ($self->case) { $tail = '/i' } 430 else { $tail = '/' } 431 432 # Quick fix: Escape non-words. 433 foreach my $ret (@ret) { 434 $ret =~ s/([^\x00-\x1F\s\w\x7F-\xFF])/\\$1/g; 435 } 436 437 my $code = <<EOCODE; 438sub { 439 use utf8; 440 my(\@matches); 441EOCODE 442 $code .= <<EOCODE if @pat > 5; 443 study; 444EOCODE 445 my $i; 446 447 for $i (0 .. $#pat) { 448 $code .= <<EOCODE; 449 push \@matches, '$ret[$i]' if /$pat[$i]$tail; 450EOCODE 451 } 452 $code .= <<EOCODE; 453 return \@matches; 454} 455EOCODE 456# print "<PRE>$code</pre>"; # used for debugging 457 my $function = eval $code; 458 die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR; 459 return $function; 460} 461 462## PUBLIC METHOD 463sub match_all { 464 my $self = shift; 465 my ($sep, $tail); 466 if ($self->case) { 467 $sep = "/i && /"; 468 $tail = "/i }"; 469 } else { 470 $sep = "/ && /"; 471 $tail = "/ }"; 472 } 473 my $code = "sub { use utf8; /" . join("$sep", @_) . $tail; 474 my $function = eval $code; 475 die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR; 476 return $function; 477} 478 479## PUBLIC METHOD 480sub match_this { 481 my $self = shift; 482 my $string = join '\s+', @_; 483 $string = '(?i)' . $string if ($self->case); 484 my $code = "sub { use utf8; /" . $string . "/ }"; 485 my $function = eval $code; 486 die "bad pattern: $EVAL_ERROR" if $EVAL_ERROR; 487 return $function; 488} 489 4901; 491__END__ 492 493=encoding utf-8 494 495=head1 NAME 496 497Sympa::WWW::Marc::Search - Search archives of Sympa 498 499=head1 SYNOPSIS 500 501TBD. 502 503=head1 DESCRIPTION 504 505TBD. 506 507=head1 HISTORY 508 509L<Sympa::WWW::Marc::Search> was originally taken from 510L<Marc::Search> in MHonArc Search Engine by Eric D. Friedman: 511L<http://www.mhonarc.org/contrib/marc-search/>. 512 513=cut 514