1#! %PERL% -w
2# -*- Perl -*-
3# mknmz - indexer of Namazu
4# $Id: mknmz.in,v 1.85.4.90 2008-06-02 09:48:13 opengl2772 Exp $
5#
6# Copyright (C) 1997-1999 Satoru Takabayashi All rights reserved.
7# Copyright (C) 2000-2008 Namazu Project All rights reserved.
8#     This is free software with ABSOLUTELY NO WARRANTY.
9#
10#  This program is free software; you can redistribute it and/or modify
11#  it under the terms of the GNU General Public License as published by
12#  the Free Software Foundation; either versions 2, or (at your option)
13#  any later version.
14#
15#  This program is distributed in the hope that it will be useful
16#  but WITHOUT ANY WARRANTY; without even the implied warranty of
17#  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18#  GNU General Public License for more details.
19#
20#  You should have received a copy of the GNU General Public License
21#  along with this program; if not, write to the Free Software
22#  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA
23#  02111-1307, USA
24#
25#  This file must be encoded in EUC-JP encoding
26#
27
28package mknmz;
29require 5.004;
30use English;
31use lib "%ADDITIONAL_INC%";
32use Cwd;
33use IO::File;
34use File::Find;
35use File::MMagic;
36use Time::Local;
37use strict;  # be strict since v1.2.0
38use Getopt::Long;
39use File::Copy;
40use DirHandle;
41use File::Basename;
42
43use vars qw($SYSTEM);
44# It exists only for back compatibility.
45$SYSTEM = $English::OSNAME;
46
47my $NAMAZU_INDEX_VERSION = "2.0";
48
49my $CodingSystem = "euc";
50my $PKGDATADIR    = $ENV{'pkgdatadir'} || "@pkgdatadir@";
51my $CONFDIR       = "@CONFDIR@";     # directory where mknmzrc are in.
52my $LIBDIR        = $PKGDATADIR . "/pl";      # directory where library etc. are in.
53my $FILTERDIR     = $PKGDATADIR . "/filter";   # directory where filters are in.
54my $TEMPLATEDIR   = $PKGDATADIR . "/template"; # directory where templates are in.
55
56my $DeletedFilesCount = 0;
57my $UpdatedFilesCount = 0;
58my $APPENDMODE = 0;
59my %PhraseHash = ();
60my %PhraseHashLast = ();
61my %KeyIndex = ();
62my %KeyIndexLast = ();
63my %CheckPoint = ("on" => undef, "continue" => undef);
64my $ConfigFile = undef;
65my $MediaType  = undef;
66
67my $ReplaceCode  = undef;  # perl code for transforming URI
68my @Seed = ();
69my @LoadedRcfiles = ();
70my $Magic = new File::MMagic;
71
72my $ReceiveTERM = 0;
73
74STDOUT->autoflush(1);
75STDERR->autoflush(1);
76main();
77sub main {
78    my $start_time = time;
79
80    if ($English::PERL_VERSION == 5.008001) {
81        unless (defined $ENV{PERL_HASH_SEED} && $ENV{PERL_HASH_SEED} eq 0) {
82            print "Run mknmz with the environment variable PERL_HASH_SEED=0\n";
83            exit 1;
84        }
85    }
86
87    init();
88
89    # At first, loading pl/conf.pl to prevent overriding some variables.
90    preload_modules();
91
92    # set LANG and bind textdomain
93    util::set_lang();
94    textdomain('namazu', $util::LANG_MSG);
95
96    load_modules();
97    my ($output_dir, @targets) = parse_options();
98    my ($docid_base, $total_files_num) = prep($output_dir, @targets);
99
100    my $swap = 1;
101    my $docid_count = 0;
102    my $file_count = 0;
103    my $total_files_size = 0;
104    my $key_count = 0;
105    my $checkpoint = 0;
106    my $flist_ptr = 0;
107    my $processed_files_size = 0;
108
109    if ($CheckPoint{'continue'}) {
110	# Restore variables
111	eval util::readfile($var::NMZ{'_checkpoint'}) ;
112    } else {
113	print $total_files_num . _(" files are found to be indexed.\n");
114    }
115
116    {
117	my $fh_errorsfile = util::efopen(">>$var::NMZ{'err'}");
118	my $fh_flist = util::efopen($var::NMZ{'_flist'});
119	my %field_indices = ();
120	get_field_index_base(\%field_indices);
121
122	if ($CheckPoint{'continue'}) {
123	    seek($fh_flist, $flist_ptr, 0);
124	}
125
126	# Process target files one by one
127	while (defined(my $line = <$fh_flist>)) {
128	    $flist_ptr += length($line);
129	    my $cfile = $line;
130	    chomp $cfile;
131	    util::dprint(_("target file: ")."$cfile\n");
132
133	    my ($cfile_size, $num) =
134		process_file($cfile, $docid_count, $docid_base,
135			     $file_count, \%field_indices,
136			     $fh_errorsfile, $total_files_num);
137	    if ($num == 0) {
138		$total_files_num--;
139		next;
140	    } else {
141		$docid_count += $num;
142		$file_count++;
143	    }
144
145	    $total_files_size     += $cfile_size;
146	    $processed_files_size += $cfile_size;
147	    last if $ReceiveTERM;
148	    if ($processed_files_size > $conf::ON_MEMORY_MAX) {
149		if (%KeyIndex) {
150		    $key_count = write_index();
151		    print _("Writing index files...");
152		    write_phrase_hash();
153		    print "\n";
154		}
155		$processed_files_size = 0;
156		$checkpoint = 1, last if $CheckPoint{'on'} && defined(<$fh_flist>);
157	    }
158	}
159
160        util::fclose($fh_flist);
161        util::fclose($fh_errorsfile);
162    }
163    # This should be out of above blocks because of file handler closing.
164    re_exec($flist_ptr, $docid_count, $docid_base, $start_time,
165	    $total_files_size, $total_files_num,
166	    $file_count, $key_count) if $checkpoint;
167
168    if (%KeyIndex) {
169	$key_count = write_index();
170	print _("Writing index files...");
171	write_phrase_hash();
172	print "\n";
173    }
174
175    $key_count = get_total_keys() unless $key_count;
176    do_remain_job($total_files_size, $docid_count, $key_count,
177		   $start_time);
178    exit 0;
179}
180
181#
182# FIXME: Very complicated.
183#
184sub process_file ($$$$\%$$) {
185    my ($cfile, $docid_count, $docid_base, $file_count,
186	$field_indices, $fh_errorsfile, $total_files_num) = @_;
187
188    my $processed_num = 0;
189    my $file_size = util::filesize($cfile);
190
191    if ($var::Opt{'htmlsplit'} && $cfile =~ $conf::HTML_SUFFIX) {
192	my @parts;
193	@parts = htmlsplit::split($cfile, "NMZ.partial")
194            if ($file_size <= $conf::FILE_SIZE_MAX);
195	if (@parts > 1) {
196	    my $id = 0;
197	    for my $part (@parts) {
198		next if (defined $conf::EXCLUDE_PATH &&
199			 "$cfile#$part" =~ /$conf::EXCLUDE_PATH/);
200		my $fname = util::tmpnam("NMZ.partial.$id");
201		my $fragment  = defined $part ? $part : undef;
202		my $uri   = generate_uri($cfile, $fragment);
203		my $result = namazu_core($fname,
204					 $docid_count + $processed_num,
205					 $docid_base, $file_count,
206					 $field_indices, $fh_errorsfile,
207					 $total_files_num,
208					 $uri, $id, $#parts);
209		if ($result > 0) {
210		    $processed_num++;
211		    my $rname = defined $part ? "$cfile\t$part" : "$cfile";
212		    put_registry($rname);
213		}
214	    	unlink $fname;
215		$id++;
216	    }
217	    return ($file_size, $processed_num);
218	}
219    }
220    my $result = namazu_core($cfile, $docid_count, $docid_base,
221			     $file_count, $field_indices,
222			     $fh_errorsfile, $total_files_num,
223			     undef, undef, undef);
224    if ($result > 0) {
225	$processed_num++;
226	put_registry($cfile);
227    }
228
229    return ($file_size, $processed_num);
230}
231
232#
233# Load mknmzrcs:
234#
235#  1. MKNMZRC environment
236#
237#  2. $(sysconfdir)/$(PACKAGE)/mknmzrc
238#
239#  3. ~/.mknmzrc
240#
241#  4. user-specified mknmzrc set by mknmz --config=file option.
242#
243# If multiple files exists, read all of them.
244#
245sub load_rcfiles () {
246    my (@cand) = ();
247
248    # To support Windows. Since they have nasty drive letter convention,
249    # it is necessary to change mknmzrc dynamically with env. variable.
250    push @cand, $ENV{'MKNMZRC'} if defined $ENV{'MKNMZRC'};
251    push @cand, "$CONFDIR/mknmzrc";
252    push @cand, "$ENV{'HOME'}/.mknmzrc";
253
254    util::vprint(_("Reading rcfile: "));
255    for my $rcfile (@cand) {
256	if (-f $rcfile) {
257		load_rcfile ($rcfile);
258	    util::vprint(" $rcfile");
259	}
260    }
261    util::vprint("\n");
262}
263
264sub load_rcfile ($) {
265    my ($rcfile) = @_;
266    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
267	util::win32_yen_to_slash(\$rcfile);
268    }
269    return if (grep {m/^$rcfile$/} @LoadedRcfiles);
270    do $rcfile;
271    if ($@) {
272        chop $@;
273        push @LoadedRcfiles, "load failed " .$rcfile . "\'$@\'";
274    }else {
275        push @LoadedRcfiles, $rcfile;
276    }
277
278    # Dirty workaround.
279    $LIBDIR = $conf::LIBDIR
280	if (defined $conf::LIBDIR && -d $conf::LIBDIR);
281    $FILTERDIR = $conf::FILTERDIR
282	if (defined $conf::FILTERDIR && -d $conf::FILTERDIR);
283    $TEMPLATEDIR = $conf::TEMPLATEDIR
284	if (defined $conf::TEMPLATEDIR && -d $conf::TEMPLATEDIR);
285}
286
287sub re_exec($$$$$$$$) {
288    my ($flist_ptr, $docid_count, $docid_base, $start_time,
289	$total_files_size, $total_files_num, $file_count, $key_count) = @_;
290
291    # store variables
292    {
293	my $fh_checkpoint = util::efopen(">$var::NMZ{'_checkpoint'}");
294
295	print $fh_checkpoint <<EOM;
296	\$DeletedFilesCount = $DeletedFilesCount;
297	\$UpdatedFilesCount = $UpdatedFilesCount;
298	\$APPENDMODE = $APPENDMODE;
299	\$flist_ptr = $flist_ptr;
300	\$docid_count = $docid_count;
301	\$docid_base = $docid_base;
302	\$start_time = $start_time;
303	\$total_files_size = $total_files_size;
304	\$total_files_num = $total_files_num;
305	\$key_count = $key_count;
306	\$file_count = $file_count;
307	\$\$ = $$;
308EOM
309        util::fclose($fh_checkpoint);
310    }
311
312    @ARGV = ("-S", @ARGV) ;
313    print _("Checkpoint reached: re-exec mknmz...\n");
314    util::dprint(join ' ', ("::::", @ARGV, "\n"));
315    exec ($0, @ARGV) ;
316}
317
318sub put_registry ($) {
319    my ($filename) = @_;
320    my $fh_registry = util::efopen(">>$var::NMZ{'_r'}");
321    print $fh_registry $filename, "\n";
322    util::fclose($fh_registry);
323}
324
325
326# Initialization
327#   $CodingSystem: Character Coding System 'euc' or 'sjis'
328sub init () {
329    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
330	$CodingSystem = "sjis";
331	if ($CONFDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
332	    $CONFDIR = $1 . $CONFDIR ;
333	}
334	if ($LIBDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
335	    $LIBDIR = $1 . $LIBDIR ;
336	}
337	if ($FILTERDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
338	    $FILTERDIR = $1 . $FILTERDIR ;
339	}
340	if ($TEMPLATEDIR !~ /^[A-Z]:|^\\\\/i && $0 =~ m#^([A-Z]:)(/|\\)#i) {
341	    $TEMPLATEDIR = $1 . $TEMPLATEDIR ;
342	}
343    } else {
344	$CodingSystem = "euc";
345    }
346
347    $SIG{'INT'}  = sub {
348	util::cdie("SIGINT caught! Aborted.\n");
349    };
350
351    $SIG{'TERM'}  = sub {
352	print STDERR "SIGTERM caught!\n";
353	$ReceiveTERM = 1;
354    };
355}
356
357sub preload_modules () {
358    unshift @INC, $LIBDIR;
359    # workaround for test suites.
360    unshift @INC, $ENV{'top_builddir'} . "/pl" if defined $ENV{'top_builddir'};
361
362    require "var.pl" || die "unable to require \"var.pl\"\n";
363    require "conf.pl" || die "unable to require \"conf.pl\"\n";
364    require "util.pl" || die "unable to require \"util.pl\"\n";
365    require "gettext.pl" || die "unable to require \"gettext.pl\"\n";
366    require "ext.pl" || die "unable to require \"ext.pl\"\n";
367}
368
369sub postload_modules () {
370    require "htmlsplit.pl" || die "unable to require \"htmlsplit.pl\"\n";
371}
372
373sub load_modules () {
374    require "usage.pl" || die "unable to require \"usage.pl\"\n";
375    require "codeconv.pl" || die "unable to require \"codeconv.pl\"\n";
376    require "wakati.pl" || die "unable to require \"wakati.pl\"\n";
377    require "seed.pl" || die "unable to require \"seed.pl\"\n";
378    require "gfilter.pl" || die "unable to require \"gfilter.pl\"\n";
379
380    @Seed = seed::init();
381}
382
383sub load_filtermodules () {
384    unshift @INC, $FILTERDIR;
385
386    #
387    # Windows modules must be loaded first.
388    # Because OLE filters have low precedence over normal ones.
389    #
390    load_win32modules() if $English::OSNAME eq "MSWin32";
391
392    # Check filter modules
393    my @filters = ();
394    @filters = glob "$FILTERDIR/*.pl";
395
396    load_filters(@filters);
397}
398
399sub load_win32modules () {
400    # Check filter modules
401    my @filters = ();
402    if (-f "../filter/win32/olemsword.pl") { # to ease developing
403	@filters = glob "../filter/win32/*.pl";
404	unshift @INC, "../filter/win32";
405    } else {
406	@filters = glob "$FILTERDIR/win32/*.pl";
407	unshift @INC, "$FILTERDIR/win32";
408    }
409
410    load_filters(@filters);
411}
412
413sub load_filters (@) {
414    my @filters = @_;
415
416    for my $filter (@filters) {
417	$filter =~ m!([-\w]+)\.pl$!;
418	my $module = $1;
419	require "$module.pl" || die "unable to require \"$module.pl\"\n";;
420	my (@mtypes, $status, $recursive, $pre_codeconv, $post_codeconv);
421
422	eval "\@mtypes =    ${module}::mediatype();";
423	die $@ if $@;  # eval error
424	eval "\$status =    ${module}::status();";
425	die $@ if $@;
426	eval "\$recursive = ${module}::recursive();";
427	die $@ if $@;
428	eval "\$pre_codeconv  = ${module}::pre_codeconv();";
429	die $@ if $@;
430	eval "\$post_codeconv  = ${module}::post_codeconv();";
431	die $@ if $@;
432	eval "${module}::add_magic(\$Magic);";
433	die $@ if $@;
434
435	for my $mt (@mtypes) {
436        next if (defined $var::Supported{$mt} &&
437                 $var::Supported{$mt} eq 'yes' && $status eq 'no');
438	    $var::Supported{$mt} = $status;
439	    $var::REQUIRE_ACTIONS{$mt} = $module;
440	    $var::RECURSIVE_ACTIONS{$mt} = $recursive;
441	    $var::REQUIRE_PRE_CODECONV{$mt} = $pre_codeconv;
442	    $var::REQUIRE_POST_CODECONV{$mt} = $post_codeconv;
443	}
444    }
445}
446
447# Core routine.
448#
449# FIXME: Too many parameters. They must be cleared.
450#
451sub namazu_core ($$$$$$$$$$) {
452    my ($cfile, $docid_count, $docid_base,
453	$file_count, $field_indices, $fh_errorsfile, $total_files_num,
454	$uri, $part_id, $part_num) = @_;
455
456    my $headings = "";
457    my $content = "";
458    my $weighted_str = "";
459    my %fields;
460    my $msg_prefix;
461
462    if ($part_id) {
463	$msg_prefix = "    $part_id/$part_num - ";
464    } else {
465	$msg_prefix = $file_count + 1 . "/$total_files_num - ";
466    }
467
468    unless ($uri) {
469	$uri = generate_uri($cfile);  # Make a URI from a file name.
470    }
471    my ($cfile_size, $text_size, $kanji, $mtype) =
472	load_document(\$cfile, \$content, \$weighted_str,
473		      \$headings, \%fields);
474
475    {
476        $fields{'mtime'} = (stat($cfile))[9];
477        my $utc = $fields{'mtime'};
478        $utc = time::rfc822time_to_mtime($fields{'date'})
479                if (defined $fields{'date'});
480        if ($utc == -1) {
481            my $date = $fields{'date'};
482            print "$cfile Illegal date format. : $date\n";
483            print $fh_errorsfile "$cfile Illegal date format. : $date\n";
484            $utc = $fields{'mtime'};
485            delete $fields{'date'};
486        }
487        $fields{'utc'} = $utc;
488    }
489
490    util::dprint(_("after load_document: ")."$uri: $cfile_size, $text_size, $kanji, $mtype\n");
491
492    # Check if the file is acceptable.
493    my $err = check_file($cfile, $cfile_size, $text_size, $mtype, $uri);
494    if (defined $err) {
495	if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
496	    my $uri2 = codeconv::eucjp_to_shiftjis($uri);
497	    print $msg_prefix . "$uri2 $err\n";
498	} else {
499	    print $msg_prefix . "$uri $err\n";
500	}
501	print $fh_errorsfile "$cfile $err\n";
502	return 0;  # return 0 if error
503    }
504
505    # Print processing file name as URI.
506    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
507	my $uri2 = codeconv::eucjp_to_shiftjis($uri);
508	print $msg_prefix . "$uri2 [$mtype]\n";
509    } else {
510	print $msg_prefix . "$uri [$mtype]\n";
511    }
512
513    # Add filename.
514    my $filename = defined $cfile ? $cfile : '';
515    codeconv::toeuc(\$filename);
516    $filename = basename($filename);
517    $fields{'filename'} = $filename;
518
519    complete_field_info(\%fields, $cfile, $uri,
520			\$headings, \$content, \$weighted_str);
521    put_field_index(\%fields, $field_indices);
522
523    put_dateindex($cfile);
524    $content .= "\n\n$filename\n\n";    # add filename
525    $content .= $weighted_str;          # add weights
526    count_words($docid_count, $docid_base, \$content, $kanji);
527    make_phrase_hash($docid_count, $docid_base, \$content);
528
529    # assertion
530    util::assert($cfile_size != 0,
531		 "cfile_size == 0 at the end of namazu_core.");
532
533    return $cfile_size;
534}
535
536#
537# Make the URI from the given file name.
538#
539sub generate_uri (@) {
540    my ($file, $fragment) = @_;
541    return "" unless defined $file;
542
543    # omit a file name if omittable
544    $file =~ s!^(.*)/($conf::DIRECTORY_INDEX)$!$1/!o;
545
546    if (defined $ReplaceCode) {
547	# transforming URI by evaling
548	$_ = $file;
549	eval $ReplaceCode;
550	$file = $_;
551    }
552
553    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
554	$file =~ s#^([A-Z]):#/$1|#i; # converting a drive part like: /C|
555    }
556
557    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
558	$file = codeconv::shiftjis_to_eucjp($file);
559    }
560    if (defined $fragment) {
561        codeconv::toeuc(\$fragment);
562    }
563
564    unless ($var::Opt{'noencodeuri'}) {
565	for my $tmp ($file, $fragment) {
566	    next unless defined $tmp;
567
568	    # Escape unsafe characters (not strict)
569	    $tmp =~ s/\%/%25/g;  # Convert original '%' into '%25' v1.1.1.2
570	    $tmp =~ s/([^a-zA-Z0-9~\-\_\.\/\:\%])/
571		sprintf("%%%02X",ord($1))/ge;
572	}
573    }
574
575
576    my $uri = $file;
577    $uri .= "#" . $fragment if defined $fragment;
578    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
579        # restore '|' for drive letter rule of Win32, OS/2
580        $uri =~ s!^/([A-Z])%7C!/$1|!i;
581    }
582    return $uri;
583}
584
585
586sub get_field_index_base (\%) {
587    my ($field_indices) = @_;
588
589    my @keys = split('\|', $conf::SEARCH_FIELD);
590    if ($var::Opt{'meta'}) {
591	push @keys, (split '\|', $conf::META_TAGS);
592    }
593    for my $key (@keys) {
594	$key = lc($key);
595	my $fname    = "$var::NMZ{'field'}.$key";
596	my $tmp_fname = util::tmpnam("NMZ.field.$key");
597	my $size = 0;
598	$size = -s $fname if -f $fname;
599	$size += -s $tmp_fname if -f $tmp_fname;
600	$field_indices->{$key} = $size;
601    }
602}
603
604sub complete_field_info (\%$$\$\$\$) {
605    my ($fields, $cfile, $uri, $headings, $contref, $wsref) = @_;
606
607    for my $field (keys %{$fields}) {
608        if (!defined($fields->{$field}) or $fields->{$field} =~ /^\s*$/) {
609            delete $fields->{$field};
610        }
611    }
612
613    unless (defined($fields->{'title'})) {
614	$fields->{'title'} = gfilter::filename_to_title($cfile, $wsref);
615    }
616    unless (defined($fields->{'date'})) {
617	my $mtime = $fields->{'mtime'};
618	my $date = util::rfc822time($mtime);
619	$fields->{'date'} = $date;
620    }
621    unless (defined($fields->{'uri'})) {
622	$fields->{'uri'} = $uri;
623    }
624    unless (defined($fields->{'size'})) {
625	$fields->{'size'} = -s $cfile;
626    }
627    unless (defined($fields->{'summary'})) {
628	$fields->{'summary'} = make_summary($contref, $headings, $cfile);
629    }
630    unless (defined($fields->{'from'}) || defined($fields->{'author'})) {
631	$fields->{'from'} = getmsg("unknown");
632    }
633}
634
635#
636# Currently, messages for NMZ.* files should be encoded in
637# EUC-JP currently. We cannot use gettext.pl for the messsage
638# because gettext.pl may use Shift_JIS encoded messages.
639# So, we should use the function instead of gettext().
640#
641# FIXME: Ad hoc impl.  getmsg() is effective only for "unknown".
642#
643sub getmsg($) {
644    my ($msg) = @_;
645
646    if (util::islang_msg("ja")) {
647	if ($msg eq "unknown") {
648	    return "����";
649	}
650    }
651    return $msg;
652}
653
654sub make_summary ($$$) {
655    my ($contref, $headings, $cfile) = @_;
656
657    # pick up $conf::MAX_FIELD_LENGTH bytes string
658    my $tmp = "";
659    if ($$headings ne "") {
660	$$headings =~ s/^\s+//;
661	$$headings =~ s/\s+/ /g;
662	$tmp = $$headings;
663    } else {
664	$tmp = "";
665    }
666
667    my $offset = 0;
668    my $tmplen = 0;
669    while (($tmplen = $conf::MAX_FIELD_LENGTH + 1 - length($tmp)) > 0
670           && $offset < length($$contref))
671    {
672        $tmp .= substr $$contref, $offset, $tmplen;
673        $offset += $tmplen;
674        $tmp =~ s/(([\xa1-\xfe]).)/$2 eq "\xa8" ? '': $1/ge;
675        $tmp =~ s/([-=*\#])\1{2,}/$1$1/g;
676    }
677
678    # -1 means "LF"
679    my $summary = substr $tmp, 0, $conf::MAX_FIELD_LENGTH - 1;
680    # Remove a garbage Kanji 1st char at the end.
681    $summary = codeconv::chomp_eucjp($summary);
682
683    $summary =~ s/^\s+//;
684    $summary =~ s/\s+/ /g;   # normalize white spaces
685
686    return $summary;
687}
688
689
690# output the field infomation into NMZ.fields.* files
691sub put_field_index (\%$) {
692    my ($fields, $field_indices) = @_;
693
694    my $aliases_regex =
695	join('|', sort {length($b) <=> length($a)} keys %conf::FIELD_ALIASES);
696
697    for my $field (keys %{$fields}) {
698        util::dprint("Field: $field: $fields->{$field}\n");
699	if ($field =~ /^($aliases_regex)$/o) {
700	    unless (defined($fields->{$conf::FIELD_ALIASES{$field}})) {
701		$fields->{$conf::FIELD_ALIASES{$field}} = $fields->{$field};
702	    }
703	    undef $fields->{$field};
704	}
705    }
706
707    my @keys = split '\|', $conf::SEARCH_FIELD;
708    if ($var::Opt{'meta'}) {
709        my @meta = split '\|', $conf::META_TAGS;
710        while (my $meta = shift(@meta)) {
711            $meta = $conf::FIELD_ALIASES{$meta}
712                if (defined $conf::FIELD_ALIASES{$meta});
713
714            push @keys, $meta;
715        }
716
717        # uniq @keys
718        my %mark = ();
719        @keys = grep {$mark{$_}++; $mark{$_} == 1} @keys;
720    }
721    for my $key (@keys) {
722	my $lkey = lc($key);
723	my $fname    = util::tmpnam("NMZ.field.$lkey");
724	my $fh_field = util::efopen(">>$fname");
725	my $output = "";
726	if (defined($fields->{$key})) {
727	    if ($key ne 'uri') { # workaround for namazu-bugs-ja#30
728		$fields->{$key} =~ s/\s+/ /g;
729		$fields->{$key} =~ s/\s+$//;
730		$fields->{$key} =~ s/^\s+//;
731	    }
732	    $output = $fields->{$key};
733
734	    # -1 means "LF"
735	    $output = substr $output, 0, $conf::MAX_FIELD_LENGTH - 1;
736	    # Remove a garbage Kanji 1st char at the end.
737	    $output = codeconv::chomp_eucjp($output);
738
739            $output =~ s/\n.*$//s;
740            $output .= "\n";
741	} else {
742	    $output = "\n";
743	}
744	print $fh_field $output;
745        util::fclose($fh_field);
746
747	# put index of field index
748	{
749	    my $fname        = util::tmpnam("NMZ.field.$lkey.i");
750	    my $fh_field_idx = util::efopen(">>$fname");
751	    print $fh_field_idx pack("N", $field_indices->{$lkey});
752	    $field_indices->{$lkey} += length $output;
753            util::fclose($fh_field_idx);
754	}
755    }
756
757}
758
759# put the date infomation into NMZ.t file
760sub put_dateindex ($) {
761    my ($cfile) = @_;
762    my $mtime = (stat($cfile))[9];
763
764    my $fh_dataindex = util::efopen(">>$var::NMZ{'_t'}");
765    print $fh_dataindex pack("N", $mtime);
766    util::fclose($fh_dataindex);
767}
768
769
770# load a document file
771sub load_document ($$$$\%) {
772    my ($orig_cfile, $contref, $weighted_str, $headings, $fields)
773      = @_;
774    my $cfile = $$orig_cfile;
775
776    return (0, 0, 0, 0) unless (-f $cfile && util::canopen($cfile));
777
778    # for handling a filename which contains Shift_JIS code for Windows.
779    # for handling a filename which contains including space.
780    my $shelter_cfile = "";
781    if (($cfile =~ /\s/) ||
782        ($English::OSNAME eq "MSWin32"
783        && $cfile =~ /[\x81-\x9f\xe0-\xef][\x40-\x7e\x80-\xfc]|[\x20\xa1-\xdf]/) )
784    {
785	$shelter_cfile = $cfile;
786	$cfile = util::tmpnam("NMZ.win32");
787	unlink $cfile if (-e $cfile);
788	copy($shelter_cfile, $cfile);
789    }
790
791    my $file_size;
792    $file_size = util::filesize($cfile); # not only file in feature.
793    if ($file_size > $conf::FILE_SIZE_MAX) {
794	return ($file_size, $file_size, 0, 'x-system/x-error; x-error=file_size_max');
795    }
796
797    $$contref = util::readfile($cfile);
798#    $file_size = length($$contref);
799
800    my ($kanji, $mtype) = apply_filter($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, undef);
801
802    if ($English::OSNAME eq "MSWin32" && $shelter_cfile ne "") {
803	unlink $cfile;
804	$cfile = $shelter_cfile;
805    }
806
807    # Measure the text size at this time.
808    my $text_size = length($$contref) + length($$weighted_str);
809
810    return ($file_size, $text_size, $kanji, $mtype);
811}
812
813sub apply_filter($$$$$$$) {
814    my ($orig_cfile, $contref, $weighted_str, $headings, $fields, $shelter_cfile, $mmtype)
815      = @_;
816    my $cfile = $shelter_cfile ne "" ? $shelter_cfile : $$orig_cfile;
817
818    # Filtering process.
819    my $mtype;
820    my $called_dt = 0;
821    while (1) {
822	if (defined $MediaType) {
823	    $mtype = $MediaType;
824	} elsif (defined $mmtype) {
825	    $mtype = $mmtype;
826	} else {
827	    my $mtype_n = $Magic->checktype_byfilename($cfile);
828	    my $mtype_c = $Magic->checktype_data($$contref);
829	    my $mtype_m;
830	    $mtype_m = $Magic->checktype_magic($$contref)
831	      if ((! defined $mtype_c) ||
832		  $mtype_c =~
833		  /^(text\/html|text\/plain|application\/octet-stream)$/);
834	    $mtype_c = $mtype_m
835		if (defined $mtype_m &&
836		    $mtype_m !~
837		    /^(text\/html|text\/plain|application\/octet-stream)$/);
838	    $mtype_c = 'text/plain' unless defined $mtype_c;
839	    if ($called_dt) {
840		$mtype = $mtype_c;
841	    } else {
842		$mtype = decide_type($mtype_n, $mtype_c);
843		$called_dt = 1;
844	    }
845	}
846	util::dprint(_("Detected type: ")."$mtype\n");
847
848	# Pre code conversion.
849	if ($var::REQUIRE_PRE_CODECONV{$mtype}) {
850	    util::dprint("pre_codeconv\n");
851	    codeconv_document($contref);
852	}
853
854	if (! $var::Supported{$mtype} ||
855	    $var::Supported{$mtype} ne 'yes')
856	{
857	    util::vprint(_("Unsupported media type ")."$mtype\n");
858	    return (0, "$mtype; x-system=unsupported");
859	}
860
861	if ($var::REQUIRE_ACTIONS{$mtype}) {
862	    util::vprint(_("Using ")."$var::REQUIRE_ACTIONS{$mtype}.pl\n");
863	    require $var::REQUIRE_ACTIONS{$mtype}.'.pl'
864	        || die _("unable to require ") .
865		    "\"$var::REQUIRE_ACTIONS{$mtype}.pl\"\n";
866	    my $err = undef;
867	    {
868		local $SIG{'PIPE'} = \&trapintr;
869		eval '$err = ' . $var::REQUIRE_ACTIONS{$mtype} .
870		  '::filter($orig_cfile, $contref, $weighted_str, $headings, $fields);';
871	    }
872	    if ($err) {
873		if ($err =~ m/; x-system=unsupported$/) {
874		    return (0, $err);
875		}
876		return (0, "$mtype; x-error=$err");
877	    }
878
879	    if ($@) {
880		util::vprint(_("Failed to call ")."$var::REQUIRE_ACTIONS{$mtype}\n$@\n");
881		return (0, "$mtype; x-error=$@");
882	    }
883
884	    # Post code conversion.
885	    if ($var::REQUIRE_POST_CODECONV{$mtype}) {
886	        util::dprint("post_codeconv\n");
887		codeconv_document($contref);
888	    }
889
890	    next if ($var::RECURSIVE_ACTIONS{$mtype});
891	}
892	last;
893    }
894
895    my $kanji = $$contref =~ tr/\xa1-\xfe/\xa1-\xfe/;  # Kanji contained?
896    $kanji += $$weighted_str =~ tr/\xa1-\xfe/\xa1-\xfe/;
897
898    return ($kanji, $mtype);
899}
900
901sub codeconv_document ($) {
902    my ($textref) = @_;
903    codeconv::toeuc($textref);
904    $$textref =~ s/\r\n/\n/g;
905    $$textref =~ s/\r/\n/g;
906    $$textref =~ tr/\x01-\x08\x0b-\x0c\x0e-\x1f\x7f/ /; # Remove control char.
907}
908
909sub prep () {
910    my $docid_base = 0;
911    my $output_dir = shift @_ ;
912    my @targets = @_ ;
913    my @flist = ();
914
915    $var::OUTPUT_DIR = $output_dir;
916
917    require_modules();
918    change_filenames();
919    check_present_index();
920
921    # if Checkpoint mode, return
922    return (0, 0) if $CheckPoint{'continue'};
923
924    check_lockfile($var::NMZ{'lock2'});
925    print _("Looking for indexing files...\n");
926    @flist = find_target(@targets);
927    ($docid_base, @flist) = append_index(@flist)
928	if -f $var::NMZ{'r'};
929    unless (@flist) { # if @flist is empty
930	print _("No files to index.\n");
931	exit 0;
932    }
933    set_lockfile($var::NMZ{'lock2'});
934    save_flist(@flist);
935    my $total_files_num = @flist;
936
937    return ($docid_base, $total_files_num);
938}
939
940sub save_flist(@) {
941    my @flist = @_;
942    return if (@flist == 0);
943
944    my $fh_flist = util::efopen(">$var::NMZ{'_flist'}");
945    print $fh_flist join("\n", @flist), "\n";
946    util::fclose($fh_flist);
947}
948
949sub require_modules() {
950    if (util::islang("ja") && $conf::NKF =~ /^module_nkf/) {
951	require NKF || die "unable to require \"NKF\"\n";
952        util::dprint(_("code conversion: using NKF module\n"));
953	$var::USE_NKF_MODULE = 1;
954    }
955    if (util::islang("ja") && $conf::WAKATI =~ /^module_kakasi/) {
956	require Text::Kakasi || die "unable to require \"Text::Kakasi\"\n";
957        util::dprint(_("wakati: using Text::Kakasi module\n"));
958	my $res = Text::Kakasi::getopt_argv('kakasi', '-ieuc', '-oeuc', '-w');
959    }
960    if (util::islang("ja") && $conf::WAKATI =~ /^module_chasen/) {
961	require Text::ChaSen || die "unable to require \"Text::ChaSen\"\n";
962        util::dprint(_("wakati: using Text::ChaSen module\n"));
963	my @arg = ('-i', 'e', '-j', '-F', '%m ');
964	@arg    = ('-i', 'e', '-j', '-F', '%m %H\\n') if $var::Opt{'noun'};
965	my $res = Text::ChaSen::getopt_argv('chasen-perl', @arg);
966    }
967    if (util::islang("ja") && $conf::WAKATI =~ /^module_mecab/) {
968        require MeCab || die "unable to require \"MeCab\"\n";
969        util::dprint(_("wakati: using MeCab module\n"));
970    }
971}
972
973sub check_lockfile ($) {
974    # warn if check file exists in case other process is running or abnormal
975    # stop execution (later is not the major purpose, though).
976    # This is mainly for early detection before longish find_target.
977    my ($file) = @_;
978
979    if (-f $file) {
980	print "$file "._("found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n");
981	exit 1;
982    }
983}
984
985sub set_lockfile ($) {
986    my ($file) = @_;
987
988    # make a lock file
989    if (-f $file) {
990	print "$file found. Maybe this index is being updated by another process now.\nIf not, you can remove this file.\n";
991	exit 1;
992    } else {
993	my $fh_lockfile = util::efopen(">$file");
994	print $fh_lockfile "$$"; # save pid
995        util::fclose($fh_lockfile);
996    }
997}
998
999sub remove_lockfile ($) {
1000    my ($file) = @_;
1001
1002    # remove lock file
1003    unlink $file if -f $file;
1004}
1005
1006# check present index whether it is old type of not
1007sub check_present_index () {
1008    if (-f $var::NMZ{'i'} && ! -f "$var::NMZ{'wi'}")
1009    {
1010	util::cdie(_("Present index is old type. it's unsupported.\n"));
1011    }
1012}
1013
1014# remain
1015sub do_remain_job ($$$$) {
1016    my ($total_files_size, $docid_count, $key_count, $start_time) = @_;
1017
1018    if ($docid_count == 0) {
1019	# No files are indexed
1020	if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1021	    update_dateindex();
1022	    update_registry($docid_count);
1023	}
1024    } else {
1025	set_lockfile($var::NMZ{'lock'});
1026	write_version();
1027	write_body_msg();
1028	write_tips_msg();
1029	write_result_file();
1030	update_field_index();
1031	update_dateindex();
1032	update_registry($docid_count);
1033	write_nmz_files();
1034	make_slog_file();
1035	remove_lockfile($var::NMZ{'lock'});
1036    }
1037    make_headfoot_pages($docid_count, $key_count);
1038    put_log($total_files_size, $start_time, $docid_count, $key_count);
1039    util::remove_tmpfiles();
1040    unlink $var::NMZ{'_flist'};
1041}
1042
1043sub make_headfoot_pages($$) {
1044    my ($docid_count, $key_count) = @_;
1045
1046    for my $file (glob "$TEMPLATEDIR/NMZ.head*") {
1047        if ($file =~ m!^.*/NMZ\.head(\.[-\w\.]+)?$!){
1048            my $suffix = $1 ? $1 : '';
1049            make_headfoot("$var::NMZ{'head'}${suffix}", $docid_count, $key_count);
1050        }
1051    }
1052    for my $file (glob "$TEMPLATEDIR/NMZ.foot*") {
1053        if ($file =~ m!^.*/NMZ\.foot(\.[-\w\.]+)?$!){
1054            my $suffix = $1 ? $1 : '';
1055            make_headfoot("$var::NMZ{'foot'}${suffix}", $docid_count, $key_count);
1056       }
1057    }
1058}
1059
1060# Parse command line options.
1061sub parse_options
1062{
1063    if (@ARGV == 0) {
1064	show_mini_usage();
1065	exit 1;
1066    }
1067
1068    my @targets = ();
1069    my $targets_loaded = 0;
1070    my @argv = @ARGV;
1071    my $cwd = cwd();
1072
1073    my $opt_dummy = 0;
1074    my $opt_version = 0;
1075    my $opt_help = 0;
1076    my $opt_all = 0;
1077    my $opt_chasen = 0;
1078    my $opt_chasen_noun = 0;
1079    my $opt_kakasi = 0;
1080    my $opt_mecab = 0;
1081    my $opt_checkpoint_sub = 0;
1082    my $opt_show_config = 0;
1083    my $opt_mailnews = 0;
1084    my $opt_mhonarc = 0;
1085    my $opt_norc = 0;
1086
1087    my $opt_quiet = undef;
1088    my $opt_config = undef;
1089    my $output_dir = undef;
1090    my $update_index = undef;
1091    my $include_file = undef;
1092    my $target_list = undef;
1093    my $index_lang = undef;
1094
1095    my %opt_conf;
1096
1097#    Getopt::Long::Configure('bundling');
1098    Getopt::Long::config('bundling');
1099    GetOptions(
1100       	       '0|help'              => \$opt_help,
1101	       '1|exclude=s'         => \$opt_conf{'EXCLUDE_PATH'},
1102	       '2|deny=s'            => \$opt_conf{'DENY_FILE'},
1103	       '3|allow=s'           => \$opt_conf{'ALLOW_FILE'},
1104	       '4|update=s'          => \$update_index,
1105	       '5|mhonarc'           => \$opt_mhonarc,
1106	       '6|mtime=s'           => \$var::Opt{'mtime'},
1107	       '7|html-split'        => \$var::Opt{'htmlsplit'},
1108	       'C|show-config'       => \$opt_show_config,
1109	       'E|no-edge-symbol'    => \$var::Opt{'noedgesymbol'},
1110	       'F|target-list=s'     => \$target_list,
1111	       'G|no-okurigana'      => \$var::Opt{'okurigana'},
1112	       'H|no-hiragana'       => \$var::Opt{'hiragana'},
1113	       'I|include=s'         => \$include_file,
1114	       'K|no-symbol'         => \$var::Opt{'nosymbol'},
1115	       'L|indexing-lang=s'     => \$index_lang,
1116	       'M|meta'              => \$var::Opt{'meta'},
1117	       'O|output-dir=s'      => \$output_dir,
1118	       'S|checkpoint-sub'    => \$opt_checkpoint_sub,
1119	       'T|template-dir=s'    => \$TEMPLATEDIR,
1120	       'U|no-encode-uri'     => \$var::Opt{'noencodeuri'} ,
1121	       'V|verbose'           => \$var::Opt{'verbose'},
1122	       'Y|no-delete'         => \$var::Opt{'nodelete'},
1123	       'Z|no-update'         => \$var::Opt{'noupdate'},
1124	       'a|all'               => \$opt_all,
1125               'b|use-mecab'         => \$opt_mecab,
1126	       'c|use-chasen'        => \$opt_chasen,
1127	       'd|debug'             => \$var::Opt{'debug'},
1128	       'e|robots'            => \$var::Opt{'robotexclude'},
1129	       'f|config=s'          => \$opt_config,
1130	       'h|mailnews'          => \$opt_mailnews,
1131	       'k|use-kakasi'        => \$opt_kakasi,
1132	       'm|use-chasen-noun'   => \$opt_chasen_noun,
1133	       'q|quiet'             => \$opt_quiet,
1134	       'r|replace=s'         => \$ReplaceCode,
1135	       's|checkpoint'        => \$CheckPoint{'on'},
1136	       't|media-type=s'      => \$MediaType,
1137	       'u|uuencode'          => \$opt_dummy, # for backward compat.
1138	       'v|version'           => \$opt_version,
1139	       'x|no-heading-summary'=> \$var::Opt{'noheadabst'},
1140	       'z|check-filesize'    => \$var::Opt{'checkfilesize'},
1141	       'decode-base64'       => \$var::Opt{'decodebase64'},
1142	       'norc'                => \$opt_norc,
1143	       );
1144
1145    if ($opt_quiet) {
1146	# Make STDOUT quiet by redirecting STDOUT to null device.
1147	my $devnull = util::devnull();
1148	open(STDOUT, ">$devnull") || die "$devnull: $!";
1149    }
1150
1151    if (defined $update_index) {
1152	unless (-d $update_index) {
1153	    print _("No such index: "), "$update_index\n";
1154	    exit 1;
1155	}
1156
1157	my $orig_status = $var::NMZ{'status'};
1158	$var::NMZ{'status'} = "$update_index/$var::NMZ{'status'}";
1159
1160	my $argv = get_status("argv");
1161	if (!defined $argv) {
1162	    print _("No such index: "), "$update_index\n";
1163	    exit 1;
1164	}
1165	@ARGV = split /\t/, $argv;
1166	util::dprint(_("Inherited argv: ")."@ARGV\n");
1167
1168	my $cwd  = get_status("cwd");
1169	if (!defined $cwd) {
1170	    print _("No such index: "), "$update_index\n";
1171	    exit 1;
1172	}
1173	chdir $cwd;
1174	util::dprint(_("Inherited cwd: ")."$cwd\n");
1175
1176	($output_dir, @targets) = parse_options();
1177	$output_dir = $update_index;
1178	$var::NMZ{'status'} = $orig_status;  # See also change_filenames()
1179	return ($output_dir, @targets);
1180    }
1181
1182    if (!$opt_norc && !(defined $ENV{'MKNMZNORC'})){
1183        load_rcfiles();
1184    }
1185    if ($opt_config) {
1186        if (-f $opt_config) {
1187            util::vprint(_("Reading rcfile: "));
1188            load_rcfile($ConfigFile = $opt_config);
1189            util::vprint(" $opt_config\n");
1190        }
1191    }
1192
1193    if ($index_lang) {
1194	$util::LANG = $index_lang;
1195      util::dprint("Override indexing language: $util::LANG\n");
1196    }
1197
1198    if ($opt_help) {
1199	show_usage();
1200	exit 1;
1201    }
1202
1203    if ($opt_version) {
1204	show_version();
1205	exit 1;
1206    }
1207
1208    load_filtermodules(); # to make effect $opt_config, $index_lang.
1209    postload_modules();
1210
1211    foreach my $key (keys %opt_conf){
1212	if (defined ($opt_conf{$key})) {
1213	     ${*{$conf::{$key}}{SCALAR}} = $opt_conf{$key};
1214	}
1215    }
1216
1217    if ($opt_mailnews) {
1218	$MediaType = 'message/rfc822';
1219    }
1220    if ($opt_mhonarc) {
1221	$MediaType = 'text/html; x-type=mhonarc';
1222    }
1223    if ($opt_all) {
1224	$conf::ALLOW_FILE = ".*";
1225    }
1226    if ($opt_chasen) {
1227	$conf::WAKATI = $conf::CHASEN;
1228	$var::Opt{'noun'} = 0;
1229    }
1230    if ($opt_chasen_noun) {
1231	$conf::WAKATI = $conf::CHASEN_NOUN;
1232	$var::Opt{'noun'} = 1;
1233    }
1234    if ($opt_kakasi) {
1235	$conf::WAKATI = $conf::KAKASI;
1236	$var::Opt{'noun'} = 0;
1237    }
1238    if ($opt_mecab) {
1239        $conf::WAKATI = $conf::MECAB;
1240        $var::Opt{'noun'} = 0;
1241    }
1242    if ($include_file) {
1243	do $include_file;
1244        util::dprint("Included: $include_file\n");
1245    }
1246    if ($target_list) {
1247	if ($CheckPoint{'continue'}) {
1248	    @targets = ("dummy");
1249	} else {
1250	    @targets = load_target_list($target_list);
1251	    util::dprint(_("Loaded: ")."$target_list\n");
1252	}
1253	$targets_loaded = 1;
1254    }
1255    if ($opt_checkpoint_sub) {
1256	$CheckPoint{'on'}           = 1;
1257	$CheckPoint{'continue'}     = 1;
1258	@argv = grep {! /^-S$/} @argv;  # remove -S
1259    }
1260
1261    if (defined $ReplaceCode) {
1262	my $orig = "/foo/bar/baz/quux.html";
1263	$_ = $orig;
1264	eval $ReplaceCode;
1265	if ($@) {  # eval error
1266	    util::cdie(_("Invalid replace: ")."$ReplaceCode\n");
1267	}
1268	util::dprint(_("Replace: ")."$orig -> $_\n");
1269    }
1270
1271    if ($opt_show_config) {
1272	show_config();
1273	exit 1;
1274    }
1275
1276    if (@ARGV == 0 && $targets_loaded == 0) {
1277	show_mini_usage();
1278	exit 1;
1279    }
1280
1281    $output_dir = $cwd unless defined $output_dir;
1282    util::cdie("$output_dir: "._("invalid output directory\n"))
1283	unless (-d $output_dir && -w $output_dir);
1284
1285    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
1286	util::win32_yen_to_slash(\$output_dir);
1287    }
1288
1289    # take remaining @ARGV as targets
1290    if (@ARGV > 0 && $targets_loaded == 0) {
1291	@targets = @ARGV ;
1292    }
1293
1294    # revert @ARGV
1295    # unshift @ARGV, splice(@argv, 0, @argv - @ARGV);
1296    @ARGV = @argv;
1297
1298    return ($output_dir, @targets);
1299}
1300
1301sub show_config () {
1302    print _("Loaded rcfile: ") . "@LoadedRcfiles\n" if @LoadedRcfiles;
1303    print _("System: ") . "$English::OSNAME\n" if $English::OSNAME;
1304    print _("Namazu: ") . "$var::VERSION\n" if $var::VERSION;
1305    print _("Perl: ") . sprintf("%f\n", $English::PERL_VERSION);
1306    print _("File-MMagic: ") . "$File::MMagic::VERSION\n" if $File::MMagic::VERSION;
1307    print _("NKF: ") . "$conf::NKF\n" if $conf::NKF;
1308    print _("KAKASI: ") . "$conf::KAKASI\n" if $conf::KAKASI;
1309    print _("ChaSen: ") . "$conf::CHASEN\n" if $conf::CHASEN;
1310    print _("MeCab: ") . "$conf::MECAB\n" if $conf::MECAB;
1311    print _("Wakati: ") . "$conf::WAKATI\n" if $conf::WAKATI;
1312    print _("Lang_Msg: ") . "$util::LANG_MSG\n";
1313    print _("Lang: ") . "$util::LANG\n";
1314    print _("Coding System: ") . "$CodingSystem\n";
1315    print _("CONFDIR: ") . "$CONFDIR\n";
1316    print _("LIBDIR: ") . "$LIBDIR\n";
1317    print _("FILTERDIR: ") . "$FILTERDIR\n";
1318    print _("TEMPLATEDIR: ") . "$TEMPLATEDIR\n";
1319
1320    my @all_types =     keys %var::Supported;
1321    my @supported = sort grep { $var::Supported{$_} eq "yes" } @all_types;
1322
1323    my $num_supported = @supported;
1324    my $num_unsupported = @all_types - @supported;
1325    print _("Supported media types:   ") . "($num_supported)\n";
1326    print _("Unsupported media types: ") . "($num_unsupported) " . _("marked with minus (-) probably missing application in your \$path.\n");
1327    for my $mtype (sort keys %var::Supported) {
1328	my $yn = $var::Supported{$mtype};
1329	if ($yn eq 'yes') { $yn = ' ' } else {$yn = '-'};
1330	print "$yn $mtype";
1331	if ($var::REQUIRE_ACTIONS{$mtype}){
1332	    print ": $var::REQUIRE_ACTIONS{$mtype}.pl";
1333	}
1334	print "\n";
1335    }
1336}
1337
1338sub load_target_list ($) {
1339    my ($file) = @_;
1340    my $fh_targets = util::efopen($file);
1341    my @targets = <$fh_targets>;
1342    util::fclose($fh_targets);
1343    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1344	foreach my $tmp (@targets){
1345	    $tmp =~ s/\r//g;
1346	    util::win32_yen_to_slash(\$tmp);
1347	}
1348    }
1349    chomp @targets;
1350    return @targets;
1351}
1352
1353# convert a relative path into an absolute path
1354sub absolute_path($$) {
1355    my ($cwd, $path) = @_;
1356
1357    $path =~ s!^\.$!\./!;
1358    $path =~ s!^\.[/\\]!$cwd/!;
1359    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1360	util::win32_yen_to_slash(\$path);
1361	if ($path =~ m!^//!) {
1362	} elsif ($path =~ m!^/[^/]!) {
1363	    my $driveletter = $cwd;
1364	    if ($driveletter =~ m!^([A-Z]:)!i){
1365		$driveletter = $1;
1366	    }
1367	    $path = "$driveletter$path";
1368	} elsif ($path !~ m!^[A-Z]:/!i) {
1369            $path = "$cwd/$path";
1370        }
1371    } else {
1372        $path =~ s!^([^/])!$cwd/$1!;
1373    }
1374    return $path;
1375}
1376
1377sub find_target (@) {
1378    my @targets = @_;
1379
1380    my $cwd = cwd();
1381    @targets = map { absolute_path($cwd, $_) } @targets;
1382
1383    # Convert \ to / with consideration for Shift_JIS encoding.
1384    if (($English::OSNAME eq "MSWin32") || ($English::OSNAME eq "os2")) {
1385        foreach my $tmp (@targets){
1386	    util::win32_yen_to_slash(\$tmp);
1387	}
1388    }
1389
1390    # For reporting effects of --allow, --deny, --exclude, --mtime
1391    # options in --verbose mode.
1392    my %counts = ();
1393    $counts{'possible'} = 0;
1394    $counts{'excluded'} = 0;
1395    $counts{'too_old'} = 0;
1396    $counts{'too_new'} = 0;
1397    $counts{'not_allowed'} = 0;
1398    $counts{'denied'} = 0;
1399
1400    # Traverse directories.
1401    # This routine is not efficent but I prefer reliable logic.
1402    my @flist = ();
1403    my $start = time();
1404    util::vprint(_("find_target starting: "). localtime($start). "\n");
1405    while (@targets) {
1406	my $target = shift @targets;
1407
1408	if ($target eq '') {
1409	    print STDERR "Warning: target contains empty line, skip it\n";
1410	    next;
1411	}
1412
1413	if (-f $target) { # target is a file.
1414	    add_target($target, \@flist, \%counts);
1415	} elsif (-d $target) { # target is a directory.
1416	    my @subtargets = ();
1417	    # Find subdirectories in target directory
1418	    # because File::Find::find() does not follow symlink.
1419	    if (-l $target) {
1420		my $dh = new DirHandle($target);
1421		while (defined(my $ent = $dh->read)) {
1422		    next if ($ent =~ /^\.{1,2}$/);
1423		    if ($English::OSNAME eq "MSWin32" || $English::OSNAME eq "os2") {
1424                        next if ($ent =~ m!^($conf::DENY_DDN)$!i);
1425                        my $tmp = $ent;
1426                        util::win32_yen_to_slash(\$tmp);
1427                        next if ($ent ne $tmp);
1428                    }
1429		    my $fname = "$target/$ent";
1430		    next if ($fname eq '.' || $fname eq '..');
1431		    if (-d $fname) {
1432			push(@subtargets, $fname);
1433		    } else {
1434			add_target($fname, \@flist, \%counts);
1435		    }
1436		}
1437	    } else {
1438		@subtargets = ($target);
1439	    }
1440
1441	    #
1442	    # Wanted routine for File::Find's find().
1443	    #
1444	    my $wanted_closure = sub {
1445		my $fname = "$File::Find::dir/$_";
1446		add_target($fname, \@flist, \%counts);
1447	    };
1448
1449	    find($wanted_closure, @subtargets) if (@subtargets > 0);
1450	} else {
1451	    print STDERR _("unsupported target: ") . $target;
1452	}
1453    }
1454
1455    # uniq @flist
1456    my %mark = ();
1457    @flist = grep {$mark{$_}++; $mark{$_} == 1} @flist;
1458
1459    # Sort file names with consideration for numbers.
1460    @flist = map  { $_->[0] }
1461	     sort { $a->[1] cmp $b->[1] }
1462	     map  { my $tmp = $_; $tmp =~ s/(\d+)/sprintf("%08d", $1)/ge;
1463		    [ $_, $tmp ] } @flist;
1464
1465    my $elapsed = time() - $start ;
1466    $elapsed += 1 ;   # to round up and avoid 0
1467
1468    # For --verbose option.
1469    report_find_target($elapsed, $#flist + 1, %counts);
1470
1471    return @flist;
1472}
1473
1474sub add_target ($\@\%) {
1475    my ($target, $flists_ref, $counts_ref) = @_;
1476
1477    if ($target =~ /[\n\r\t]/) {
1478	$target =~ s/[\n\r\t]//g;
1479	print STDERR "Warning: $target contains LF/CR/TAB chars, skip it\n";
1480	return;   # skip a file name containing LF/CR/TAB chars.
1481    }
1482
1483    return unless -f $target;  # Only file is targeted.
1484
1485    $counts_ref->{'possible'}++;
1486
1487    unless (util::canopen($target)) {
1488        util::vprint(sprintf(_("Unreadable:	%s"), $target));
1489	$counts_ref->{'excluded'}++;
1490	return;
1491    }
1492
1493
1494    if (defined $conf::EXCLUDE_PATH &&
1495	$target =~ /$conf::EXCLUDE_PATH/ )
1496    {
1497        util::vprint(sprintf(_("Excluded:	%s"), $target));
1498	$counts_ref->{'excluded'}++;
1499	return;
1500    }
1501
1502    #
1503    # Do processing just like find's  --mtime option.
1504    #
1505    if (defined $var::Opt{'mtime'}) {
1506	my $mtime = -M $_;
1507	if ($var::Opt{'mtime'} < 0) {
1508
1509	    # This must be `>=' not `>' for consistency with find(1).
1510	    if (int($mtime) >= - $var::Opt{'mtime'}) {
1511	        util::vprint(sprintf(_("Too old:	%s"), $target));
1512	        $counts_ref->{'too_old'}++;
1513		return;
1514	    }
1515	} elsif ($var::Opt{'mtime'} > 0) {
1516	    if ($var::Opt{'mtime'} =~ /^\+/) {
1517		if ((int($mtime) < $var::Opt{'mtime'})) {
1518		    util::vprint(sprintf(_("Too new:	%s"), $target));
1519		    $counts_ref->{'too_new'}++;
1520		    return;
1521		}
1522	    } else {
1523		if (int($mtime) != $var::Opt{'mtime'}) {
1524		    if (int($mtime) > $var::Opt{'mtime'}) {
1525		        util::vprint(sprintf(_("Too old:	%s"),$target));
1526			$counts_ref->{'too_old'}++;
1527		    } else {
1528		        util::vprint(sprintf(_("Too new:	%s"),$target));
1529			$counts_ref->{'too_new'}++;
1530		    }
1531		    return;
1532		}
1533	    }
1534	} else {
1535	    # $var::Opt{'mtime'} == 0 ;
1536	    return;
1537	}
1538    }
1539
1540    # Extract the file name of the target.
1541    $target =~ m!^.*/([^/]+)$!;
1542    my $fname = $1;
1543
1544    if ($fname =~ m!^($conf::DENY_FILE)$!i ) {
1545        util::vprint(sprintf(_("Denied:	%s"), $target));
1546	$counts_ref->{'denied'}++;
1547	return;
1548    }
1549    if ($fname !~ m!^($conf::ALLOW_FILE)$!i) {
1550        util::vprint(sprintf(_("Not allowed:	%s"), $target));
1551	$counts_ref->{'not_allowed'}++;
1552	return;
1553    } else{
1554        util::vprint(sprintf(_("Targeted:	%s"), $target));
1555	push @$flists_ref, $target;
1556    }
1557
1558}
1559
1560sub report_find_target ($$%) {
1561    my ($elapsed, $num_targeted, %counts) = @_;
1562
1563    util::vprint(_("find_target finished: ") . localtime(time()). "\n");
1564    util::vprint(sprintf(_("Target Files: %d (Scan Performance: Elapsed Sec.: %d, Files/sec: %.1f)"),
1565			 $num_targeted, $elapsed,
1566			 $num_targeted /$elapsed));
1567    util::vprint(sprintf(_("  Possible: %d, Not allowed: %d, Denied: %d, Excluded: %d"),
1568			 $counts{'possible'},
1569			 $counts{'not_allowed'},
1570			 $counts{'denied'},
1571			 $counts{'excluded'}));
1572    util::vprint(sprintf(_("  MTIME too old: %d, MTIME too new: %d"),
1573			 $counts{'too_old'},
1574			 $counts{'too_new'}));
1575}
1576
1577sub show_usage () {
1578    util::dprint(_("lang_msg: ")."$util::LANG_MSG\n");
1579    util::dprint(_("lang: ")."$util::LANG\n");
1580
1581    my $usage = $usage::USAGE;
1582    $usage = _($usage);
1583    printf "$usage", $var::VERSION, $var::TRAC_URI, $var::MAILING_ADDRESS;
1584}
1585
1586sub show_mini_usage () {
1587    print _("Usage: mknmz [options] <target>...\n");
1588    print _("Try `mknmz --help' for more information.\n");
1589}
1590
1591sub show_version () {
1592    print $usage::VERSION_INFO;
1593}
1594
1595#
1596# check the file. No $msg is good.
1597#
1598sub check_file ($$$$$) {
1599    my ($cfile, $cfile_size, $text_size, $mtype, $uri) = @_;
1600
1601    my $msg = undef;
1602    if ($mtype =~ /; x-system=unsupported$/) {
1603	$mtype =~ s/; x-system=unsupported$//;
1604	$msg = _("Unsupported media type ")."($mtype)"._(" skipped.");
1605    } elsif ($mtype =~ /; x-error=file_size_max/) {
1606        $msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
1607    } elsif ($mtype =~ /; x-error=.*$/) {
1608	$mtype =~ s/^.*; x-error=(.*)$/$1/;
1609	$msg = $mtype;
1610    } elsif ($mtype =~ /^x-system/) {
1611	$msg = _("system error occurred! ")."($mtype)"._(" skipped.");
1612    } elsif (! -e $cfile) {
1613	$msg = _("does NOT EXIST! skipped.");
1614    } elsif (! util::canopen($cfile)) {
1615	$msg = _("is NOT READABLE! skipped.");
1616    } elsif ($text_size == 0 || $cfile_size == 0) {
1617	$msg = _("is 0 size! skipped.");
1618    } elsif ($mtype =~ /^application\/octet-stream/) {
1619	$msg = _("may be a BINARY file! skipped.");
1620    } elsif ($cfile_size > $conf::FILE_SIZE_MAX) {
1621	$msg = _("is larger than your setup before filtered, skipped: ") . 'conf::FILE_SIZE_MAX (' . $conf::FILE_SIZE_MAX . ') < '. $cfile_size ;
1622    } elsif ($text_size > $conf::TEXT_SIZE_MAX) {
1623	$msg = _("is larger than your setup after filtered, skipped: ") . 'conf::TEXT_SIZE_MAX (' . $conf::TEXT_SIZE_MAX . ') < '. $text_size ;
1624    }
1625
1626    return $msg;
1627}
1628
1629
1630#
1631# Write NMZ.version file.
1632#
1633sub write_version() {
1634    unless (-f $var::NMZ{'version'}) {
1635	my $fh = util::efopen(">$var::NMZ{'version'}");
1636	print $fh "Namazu-Index-Version: $NAMAZU_INDEX_VERSION\n";
1637        util::fclose($fh);
1638    }
1639}
1640
1641#
1642# rename each temporary file to a real file name.
1643#
1644sub write_nmz_files () {
1645    util::Rename($var::NMZ{'_i'},   $var::NMZ{'i'});
1646    util::Rename($var::NMZ{'_ii'}, $var::NMZ{'ii'});
1647    util::Rename($var::NMZ{'_w'},  $var::NMZ{'w'});
1648    util::Rename($var::NMZ{'_wi'}, $var::NMZ{'wi'});
1649    util::Rename($var::NMZ{'_p'},  $var::NMZ{'p'});
1650    util::Rename($var::NMZ{'_pi'}, $var::NMZ{'pi'});
1651}
1652
1653# output NMZ.body
1654sub write_body_msg () {
1655    for my $file (glob "$TEMPLATEDIR/NMZ.body*") {
1656        if ($file =~ m!^.*/NMZ\.body(\.[-\w\.]+)?$!){
1657            my $suffix = $1 ? $1 : '';
1658            write_message("$var::NMZ{'body'}${suffix}");
1659        }
1660    }
1661}
1662
1663# output NMZ.tips
1664sub write_tips_msg () {
1665    for my $file (glob "$TEMPLATEDIR/NMZ.tips*") {
1666        if ($file =~ m!^.*/NMZ\.tips(\.[-\w\.]+)?$!){
1667            my $suffix = $1 ? $1 : '';
1668            write_message("$var::NMZ{'tips'}${suffix}");
1669        }
1670    }
1671}
1672
1673
1674# output NMZ.result.*
1675sub write_result_file () {
1676    my $fname = "NMZ.result.normal";
1677
1678    my @files = glob "$TEMPLATEDIR/NMZ.result.*";
1679
1680    for my $file (@files) {
1681	$file =~ m!(NMZ\.result\.[^/]*)$!;
1682	my $target = "$var::OUTPUT_DIR/$1";
1683	if (-f $target) {  # already exist;
1684	    next;
1685	} else {
1686	    my $buf = util::readfile($file);
1687	    my $fh_file = util::efopen(">$target");
1688	    print $fh_file $buf;
1689            util::fclose($fh_file);
1690	}
1691    }
1692}
1693
1694# write NMZ.body and etc.
1695sub write_message ($) {
1696    my ($msgfile) = @_;
1697
1698    if (! -f $msgfile) {
1699	my ($template, $fname);
1700
1701	$msgfile =~ m!.*/(.*)$!;
1702	$fname = $1;
1703	$template = "$TEMPLATEDIR/$fname";
1704
1705	if (-f $template) {
1706	    my $buf = util::readfile($template);
1707	    my $fh_output = util::efopen(">$msgfile");
1708	    print $fh_output $buf;
1709            util::fclose($fh_output);
1710	}
1711    }
1712}
1713
1714
1715#
1716# Make the NMZ.slog file for logging.
1717#
1718sub make_slog_file () {
1719    if (! -f $var::NMZ{'slog'}) {
1720	my $fh = util::efopen(">$var::NMZ{'slog'}");
1721        util::fclose($fh);
1722        undef $fh;
1723        chmod 0666, $var::NMZ{'slog'};
1724    }
1725    {
1726	my $fh_slogfile = util::efopen(">>$var::NMZ{'slog'}");
1727        util::fclose($fh_slogfile);
1728    }
1729}
1730
1731
1732#
1733# Concatenate $CURRENTDIR to the head of each file.
1734#
1735sub change_filenames ($) {
1736    my $dir = $var::OUTPUT_DIR;
1737
1738    for my $key (sort keys %var::NMZ) {
1739	next if $key =~ /^_/;    # exclude temporary file
1740	$var::NMZ{$key} = "$dir/$var::NMZ{$key}";
1741    }
1742
1743    # temporary files
1744    for my $key (sort keys %var::NMZ) {
1745	if ($key =~ /^_/) {
1746	    $var::NMZ{$key} = util::tmpnam($var::NMZ{$key});
1747	}
1748    }
1749
1750    if ($var::Opt{'debug'}) {
1751	for my $key (sort keys %var::NMZ) {
1752	    util::dprint("NMZ: $var::NMZ{$key}\n");
1753	}
1754    }
1755}
1756
1757
1758#
1759# Preparation processing for appending index files.
1760#
1761sub append_index (@) {
1762    my @flist = @_;
1763
1764    my $docid_base = 0;
1765    ($docid_base, @flist) = set_target_files(@flist);
1766
1767    unless (@flist) { 	# if @flist is empty
1768	if ($DeletedFilesCount > 0 || $UpdatedFilesCount > 0) {
1769	    set_lockfile($var::NMZ{'lock2'});
1770	    update_dateindex();
1771	    update_registry(0);
1772	    make_headfoot_pages(0, get_total_keys());
1773	    put_log(0, 0, 0, get_total_keys());
1774	    make_headfoot_pages(get_status("files"), get_status("keys"));
1775	    util::remove_tmpfiles();
1776	}
1777	print _("No files to index.\n");
1778	exit 0;
1779    }
1780
1781    $APPENDMODE = 1;
1782    # conserve files by copying
1783    copy($var::NMZ{'i'},  $var::NMZ{'_i'});
1784    copy($var::NMZ{'w'},  $var::NMZ{'_w'});
1785    copy($var::NMZ{'t'},  $var::NMZ{'_t'})
1786	unless -f $var::NMZ{'_t'}; # preupdated ?
1787    copy($var::NMZ{'p'},  $var::NMZ{'_p'});
1788    copy($var::NMZ{'pi'}, $var::NMZ{'_pi'});
1789
1790    return ($docid_base, @flist);
1791}
1792
1793#
1794# Set target files to @flist and return with the number of regiested files.
1795#
1796sub set_target_files() {
1797    my %rdocs;    # 'rdocs' means 'registered documents'
1798    my @found_files = @_;
1799
1800    # Load the list of registered documents
1801    $rdocs{'name'} = load_registry();
1802
1803    # Pick up overlapped documents and do marking
1804    my %mark1;
1805    my @overlapped_files;
1806    grep {$_ !~ /^\# / && $mark1{$_}++ } @{$rdocs{'name'}};
1807    $rdocs{'overlapped'} = {}; # Prepare an anonymous hash.
1808    for my $overlapped (grep { $mark1{$_} } @found_files) {
1809	$rdocs{'overlapped'}{$overlapped} = 1;
1810	push @overlapped_files, $overlapped;
1811    };
1812
1813    # Pick up not overlapped documents which are files to index.
1814    my @flist = grep { ! $mark1{$_} } @found_files;
1815
1816    if ($var::Opt{'noupdate'}) {
1817	return (scalar @{$rdocs{'name'}}, @flist);
1818    };
1819
1820    # Load the date index.
1821    $rdocs{'mtime'} = load_dateindex();
1822
1823    if (@{$rdocs{'mtime'}} == 0) {
1824	return (scalar @{$rdocs{'name'}}, @flist);
1825    };
1826
1827    util::assert(@{$rdocs{'name'}} == @{$rdocs{'mtime'}},
1828		 "NMZ.r ($#{$rdocs{'name'}}) and NMZ.t ($#{$rdocs{'mtime'}}) are not consistent!");
1829
1830    # Pick up deleted documents and do marking
1831    # (registered in the NMZ.r but not existent in the filesystem)
1832    my @deleted_documents;
1833    unless ($var::Opt{'nodelete'}) {
1834	my %mark2;
1835	grep { $mark2{$_}++ } @found_files;
1836	for my $deleted (grep { $_ !~ /^\# / && ! $mark2{$_} &&
1837				! $rdocs{'overlapped'}{$_} }
1838			 @{$rdocs{'name'}})
1839	{
1840	    $rdocs{'deleted'}{$deleted} = 1;
1841	    push @deleted_documents, $deleted;
1842	}
1843    }
1844
1845    # check filesize
1846    if ($var::Opt{'checkfilesize'}) {
1847	$rdocs{'size'} = load_sizefield();
1848    }
1849
1850    # Pick up updated documents and set the missing number for deleted files.
1851    my @updated_documents = pickup_updated_documents(\%rdocs);
1852
1853    # Append updated files to the list of files to index.
1854    if (@updated_documents) {
1855	push @flist, @updated_documents;
1856    }
1857
1858    # Remove duplicates.
1859    my %seen = ();
1860    @flist = grep { ! $seen{$_}++ } @flist;
1861
1862    util::dprint(_("\n\n== found files ==\n"), join("\n", @found_files), "\n");
1863    util::dprint(_("\n\n== registered documents ==\n"), join("\n", @{$rdocs{'name'}}), "\n");
1864    util::dprint(_("\n\n== overlapped documents ==\n"), join("\n", @overlapped_files), "\n");
1865    util::dprint(_("\n\n== deleted documents ==\n"), join("\n", @deleted_documents), "\n");
1866    util::dprint(_("\n\n== updated documents ==\n"), join("\n", @updated_documents), "\n");
1867    util::dprint(_("\n\n== files to index ==\n"), join("\n", @flist), "\n");
1868
1869    # Update NMZ.t with the missing number infomation and
1870    # append updated files and deleted files to NMZ.r with leading '# '
1871    if (@updated_documents || @deleted_documents) {
1872	$DeletedFilesCount = 0;
1873	$UpdatedFilesCount = 0;
1874	$UpdatedFilesCount += @updated_documents;
1875#	$DeletedFilesCount += @updated_documents;
1876	$DeletedFilesCount += @deleted_documents;
1877	preupdate_dateindex(@{$rdocs{'mtime'}});
1878	preupdate_registry(@updated_documents, @deleted_documents);
1879    }
1880
1881    # Return the number of registered documents and list of files to index.
1882    return (scalar @{$rdocs{'name'}}, @flist);
1883}
1884
1885sub preupdate_registry(@) {
1886    my (@list) = @_;
1887
1888    my $fh_registry = util::efopen(">$var::NMZ{'_r'}");
1889    @list = grep { s/(.*)/\# $1\n/ } @list;
1890    print $fh_registry @list;
1891    print $fh_registry &_("## deleted: ") . util::rfc822time(time()) . "\n\n";
1892    util::fclose($fh_registry);
1893}
1894
1895sub preupdate_dateindex(@) {
1896    my @mtimes = @_;
1897
1898    # Since rewriting the entire file, it is not efficient,
1899    # but simple and reliable. this would be revised in the future.
1900    my $fh_dateindex = util::efopen(">$var::NMZ{'_t'}");
1901#    print "\nupdate_dateindex\n", join("\n", @mtimes), "\n\n";
1902    print $fh_dateindex pack("N*", @mtimes);
1903    util::fclose($fh_dateindex);
1904}
1905
1906sub update_registry ($) {
1907    my ($docid_count) = @_;
1908
1909    {
1910	my $fh_registry = util::efopen(">>$var::NMZ{'r'}");
1911	my $fh_registry_ = util::efopen($var::NMZ{'_r'});
1912	while (defined(my $line = <$fh_registry_>)) {
1913	    print $fh_registry $line;
1914	}
1915	if ($docid_count > 0) {
1916	    print $fh_registry &_("## indexed: ") . util::rfc822time(time()) . "\n\n";
1917	}
1918        util::fclose($fh_registry_) if (defined $fh_registry_);
1919        util::fclose($fh_registry);
1920    }
1921    unlink $var::NMZ{'_r'};
1922}
1923
1924sub update_dateindex () {
1925    util::Rename($var::NMZ{'_t'}, $var::NMZ{'t'});
1926}
1927
1928sub update_field_index () {
1929    my @list = glob "$var::NMZ{'field'}.*.tmp";
1930    for my $tmp (@list) {
1931        if ($tmp =~ m!((^.*/NMZ\.field\..+?(?:\.i)?)\.tmp$)!) {
1932	    my $fname_tmp = $1;
1933	    my $fname_out = $2;
1934	    {
1935		my $fh_field = util::efopen(">>$fname_out");
1936		my $fh_tmp = util::efopen($fname_tmp);
1937
1938		while (defined(my $line = <$fh_tmp>)) {
1939		    print $fh_field $line;
1940		}
1941                util::fclose($fh_tmp) if (defined $fh_tmp);
1942                util::fclose($fh_field);
1943	    }
1944	    unlink $fname_tmp;
1945	} else {
1946	    util::cdie(_("update_field_index: ")."@list");
1947	}
1948    }
1949}
1950
1951sub pickup_updated_documents (\%) {
1952    my ($rdocs_ref) = @_;
1953    my @updated_documents = ();
1954
1955    # To avoid duplicated outputs caused by --html-split support.
1956    my %printed = ();
1957    my $i = 0;
1958    for my $cfile (@{$rdocs_ref->{'name'}}) {
1959	if (defined($rdocs_ref->{'deleted'}{$cfile})) {
1960	    unless ($printed{$cfile}) {
1961		print "$cfile " . _("was deleted!\n");
1962		$printed{$cfile} = 1;
1963	    }
1964	    $rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1965	} elsif (defined($rdocs_ref->{'overlapped'}{$cfile})) {
1966	    my $cfile_mtime = (stat($cfile))[9];
1967	    my $rfile_mtime = $rdocs_ref->{'mtime'}[$i];
1968	    my ($cfile_size, $rfile_size);
1969	    if ($var::Opt{'checkfilesize'}) {
1970		$cfile_size = (stat($cfile))[7];
1971		$rfile_size = $rdocs_ref->{'size'}[$i];
1972	    }
1973
1974	    if ($rfile_mtime != $cfile_mtime ||
1975		($var::Opt{'checkfilesize'} && ($cfile_size != $rfile_size))) {
1976		# The file is updated!
1977		unless ($printed{$cfile}) {
1978		    print "$cfile " . _("was updated!\n");
1979		    $printed{$cfile} = 1;
1980		}
1981		push(@updated_documents, $cfile);
1982		$rdocs_ref->{'mtime'}[$i] = -1; # Assign the missing number.
1983	    }
1984	}
1985	$i++;
1986    }
1987
1988    return @updated_documents
1989}
1990
1991sub load_dateindex() {
1992    my $fh_dateindex = util::efopen($var::NMZ{'t'});
1993
1994    my $size = -s $var::NMZ{'t'};
1995    my $buf  = "";
1996    read($fh_dateindex, $buf, $size);
1997    my @list = unpack("N*", $buf);  # load date index
1998#    print "\nload_dateindex\n", join("\n", @list), "\n\n";
1999
2000    util::fclose($fh_dateindex);
2001    return [ @list ];
2002}
2003
2004sub load_registry () {
2005    my $fh_registry = util::efopen($var::NMZ{'r'});
2006
2007    my @list = ();
2008    my %deleted    = ();
2009    my @registered = ();
2010
2011    while (defined(my $line = <$fh_registry>)) {
2012	chomp($line);
2013	next if $line =~ /^\s*$/;  # an empty line
2014	next if $line =~ /^##/;    # a comment
2015	if ($line =~ s/^\#\s+//) { # deleted document
2016	    $deleted{$line}++;
2017	} else {
2018	    # Remove HTML's anchor generated by --html-split option.
2019	    $line =~ s/\t.*$//g;
2020	    push @registered, $line;
2021	}
2022    }
2023
2024    util::fclose($fh_registry) if (defined $fh_registry);
2025
2026    # Exclude deleted documents.
2027    for my $doc (@registered) {
2028	if ($deleted{$doc}) {
2029	    push @list, "# $doc";
2030	    $deleted{$doc}--;
2031	} else {
2032	    push @list, $doc;
2033	}
2034    }
2035
2036    return [ @list ];
2037}
2038
2039# get file size information from NMZ.field.size
2040sub load_sizefield() {
2041    my $fh_sizefield = util::efopen($var::NMZ{'field'} . '.size');
2042    return [] unless defined $fh_sizefield;
2043    my $line;
2044    my @ret = ();
2045    while (defined($line = <$fh_sizefield>)) {
2046	chomp $line;
2047	push @ret, $line;
2048    }
2049    util::fclose($fh_sizefield) if (defined $fh_sizefield);
2050    return \@ret;
2051}
2052
2053sub get_total_keys() {
2054    my $keys = get_status("keys");
2055    $keys =~ s/,//g if (defined $keys);
2056    $keys = 0 unless defined $keys;
2057    return $keys;
2058}
2059
2060sub get_total_files() {
2061    my $files = get_status("files");
2062    $files =~ s/,//g if (defined $files);
2063    $files = 0 unless defined $files;
2064    return $files;
2065}
2066
2067sub get_status($) {
2068    my ($key) = @_;
2069
2070    my $fh = util::fopen($var::NMZ{'status'});
2071    return undef unless defined $fh;
2072
2073    while (defined(my $line = <$fh>)) {
2074	if ($line =~ /^$key\s+(.*)$/) {
2075	    util::dprint("status: $key = $1\n");
2076	    $fh->close;
2077	    return $1;
2078	}
2079    }
2080    util::fclose($fh) if (defined $fh);
2081    return undef;
2082}
2083
2084sub put_total_files($) {
2085    my ($number) = @_;
2086    $number =~ tr/,//d;
2087    put_status("files", $number);
2088}
2089
2090sub put_total_keys($) {
2091    my ($number) = @_;
2092    $number =~ tr/,//d;
2093    put_status("keys", $number);
2094}
2095
2096sub put_status($$) {
2097    my ($key, $value) = @_;
2098
2099    # remove NMZ.status file if the file has a previous value.
2100    unlink $var::NMZ{'status'} if defined get_status($key);
2101
2102    my $fh = util::efopen(">> $var::NMZ{'status'}");
2103    print $fh "$key $value\n";
2104    util::fclose($fh);
2105}
2106
2107# do logging
2108sub put_log ($$$$) {
2109    my ($total_files_size, $start_time, $docid_count, $total_keys_count) = @_;
2110
2111    my $date = localtime;
2112    my $added_files_count   = $docid_count;
2113    my $deleted_documents_count = $DeletedFilesCount;
2114    my $updated_documents_count = $UpdatedFilesCount;
2115    my $total_files_count   = get_total_files() + $docid_count
2116			      - $DeletedFilesCount - $UpdatedFilesCount;
2117    my $added_keys_count    = 0;
2118    $added_keys_count       = $total_keys_count - get_total_keys();
2119
2120    my $processtime         = time - $start_time;
2121    $processtime            = 0 if $start_time == 0;
2122    $total_files_size       = $total_files_size;
2123    $total_keys_count       = $total_keys_count;
2124
2125    my @logmsgs = ();
2126    if ($APPENDMODE) {
2127	push @logmsgs, N_("[Append]");
2128    } else {
2129	push @logmsgs, N_("[Base]");
2130    }
2131    push @logmsgs, N_("Date:"), "$date" if $date;
2132    push @logmsgs, N_("Added Documents:"), util::commas("$added_files_count")
2133	if $added_files_count;
2134    push @logmsgs, N_("Deleted Documents:"),
2135	util::commas("$deleted_documents_count") if $deleted_documents_count;
2136    push @logmsgs, N_("Updated Documents:"),
2137	util::commas("$updated_documents_count") if $updated_documents_count;
2138    push @logmsgs, N_("Size (bytes):"), util::commas("$total_files_size")
2139	if $total_files_size;
2140    push @logmsgs, N_("Total Documents:"), util::commas("$total_files_count")
2141	if $total_files_count;
2142    push @logmsgs, N_("Added Keywords:"), util::commas("$added_keys_count")
2143	if $added_keys_count;
2144    push @logmsgs, N_("Total Keywords:"), util::commas("$total_keys_count")
2145	if $total_keys_count;
2146    push @logmsgs, N_("Wakati:"), "$conf::WAKATI" if $conf::WAKATI;
2147    push @logmsgs, N_("Time (sec):"), util::commas("$processtime")
2148	if $processtime;
2149    push @logmsgs, N_("File/Sec:"),  sprintf "%.2f",
2150    	(($added_files_count + $updated_documents_count) / $processtime)
2151	if $processtime;
2152    push @logmsgs, N_("System:"), "$English::OSNAME" if $English::OSNAME;
2153    push @logmsgs, N_("Perl:"),   sprintf("%f", $English::PERL_VERSION);
2154    push @logmsgs, N_("Namazu:"), "$var::VERSION" if $var::VERSION;
2155
2156    my $log_for_file = "";
2157
2158    my $msg = shift @logmsgs;	# [Base] or [Append]
2159    # To stdout, use gettext.
2160    print _($msg), "\n";
2161    # To log file, do not use gettext.
2162    $log_for_file = $msg . "\n";
2163    while (@logmsgs) {
2164	my $field = shift @logmsgs;
2165	my $value = shift @logmsgs;
2166	printf "%-20s %s\n", _($field), "$value";
2167	$log_for_file .= sprintf "%-20s %s\n", $field, "$value";
2168    }
2169    print "\n";
2170    $log_for_file .= "\n";
2171
2172    put_log_to_logfile($log_for_file);
2173    put_total_files($total_files_count);
2174    put_total_keys($total_keys_count);
2175
2176    my $argv = join "\t", @ARGV;
2177    my $cwd  = cwd();
2178    put_status("argv", $argv);
2179    put_status("cwd",  $cwd);
2180}
2181
2182sub put_log_to_logfile ($) {
2183    my ($logmsg) = @_;
2184    my $fh_logfile = util::efopen(">>$var::NMZ{'log'}");
2185    print $fh_logfile $logmsg;
2186    util::fclose($fh_logfile);
2187}
2188
2189sub get_year() {
2190    my $year = (localtime)[5] + 1900;
2191
2192    return $year;
2193}
2194
2195# Compose NMZ.head and NMZ.foot. Prepare samples if necessary.
2196# Insert $docid_count, $key_count, and $month/$day/$year respectively.
2197sub make_headfoot ($$$) {
2198    my ($file, $docid_count, $key_count) = @_;
2199
2200    my $day   = sprintf("%02d", (localtime)[3]);
2201    my $month = sprintf("%02d", (localtime)[4] + 1);
2202    my $year  = get_year();
2203    my $buf   = "";
2204
2205    if (-f $file) {
2206	$buf = util::readfile($file);
2207    } else {
2208	$file =~ m!.*/(.*)$!;
2209	my $fname = $1;
2210	my $template = "$TEMPLATEDIR/$fname";
2211
2212	if (-f $template) {
2213	    $buf = util::readfile($template);
2214	} else {
2215	    return;
2216	}
2217    }
2218
2219    my $fh_file = util::efopen(">$file");
2220
2221    if ($buf =~ /(<!-- FILE -->)\s*(.*)\s*(<!-- FILE -->)/) {
2222        my $total_files_count = util::commas(get_total_files() + $docid_count
2223                                   - $DeletedFilesCount - $UpdatedFilesCount);
2224	$buf =~ s/(<!-- FILE -->)(.*)(<!-- FILE -->)/$1 $total_files_count $3/;
2225
2226    }
2227    if ($buf =~ /(<!-- KEY -->)\s*(.*)\s*(<!-- KEY -->)/) {
2228	my $tmp = $2;
2229	$tmp =~ tr/,//d;
2230	$tmp = $key_count;
2231	$tmp = util::commas($tmp);
2232	$buf =~ s/(<!-- KEY -->)(.*)(<!-- KEY -->)/$1 $tmp $3/;
2233    }
2234    $buf =~ s#(<!-- DATE -->)(.*)(<!-- DATE -->)#$1 $year-$month-$day $3#gs;
2235    $buf =~ s/(<!-- VERSION -->)(.*)(<!-- VERSION -->)/$1 v$var::VERSION $3/gs;
2236    $buf =~ s{(<!-- ADDRESS -->)(.*)(<!-- ADDRESS -->)}
2237	     {$1\n<a href="mailto:$conf::ADDRESS">$conf::ADDRESS</a>\n$3}gs;
2238    $buf =~ s{(<!-- LINK-REV-MADE -->)(.*)(<!-- LINK-REV-MADE -->)}
2239	     {$1\n<link rev="made" href="mailto:$conf::ADDRESS">\n$3}gs;
2240
2241    print $fh_file $buf;
2242    util::fclose($fh_file);
2243}
2244
2245# Make phrase hashes for NMZ.p
2246# Handle two words each for calculating a hash value ranged 0-65535.
2247sub make_phrase_hash ($$$) {
2248    my ($docid_count, $docid_base, $contref) = @_;
2249
2250    my %tmp = ();
2251    $$contref =~ s!\x7f */? *\d+ *\x7f!!g;  # remove tags of weight
2252    $$contref =~ tr/\xa1-\xfea-z0-9 \n//cd; # remove all symbols
2253    my @words = split(/\s+/, $$contref);
2254    @words = grep {$_ ne ""} @words;   # remove empty words
2255    my $word_b = shift @words;
2256    my $docid = $docid_count + $docid_base;
2257    for my $word (@words) {
2258	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2259	my $hash = hash($word_b . $word);
2260	unless (defined $tmp{$hash}) {
2261	    $tmp{$hash} = 1;
2262	    $PhraseHashLast{$hash} = 0 unless defined $PhraseHashLast{$hash};
2263	    $PhraseHash{$hash} .= pack("w", $docid - $PhraseHashLast{$hash});
2264#	    util::dprint("<$word_b, $word> $hash\n");
2265	    $PhraseHashLast{$hash} = $docid;
2266	}
2267	$word_b = $word;
2268    }
2269}
2270
2271# Construct NMZ.p and NMZ.pi file. this processing is rather complex.
2272sub write_phrase_hash () {
2273    write_phrase_hash_sub();
2274    util::Rename($var::NMZ{'__p'}, $var::NMZ{'_p'});
2275    util::Rename($var::NMZ{'__pi'}, $var::NMZ{'_pi'});
2276}
2277
2278sub write_phrase_hash_sub () {
2279    my $opened = 0;
2280
2281    return 0 if %PhraseHash eq '';    # namazu-devel-ja #3146
2282    util::dprint(_("doing write_phrase_hash() processing.\n"));
2283
2284    my $fh_tmp_pi = util::efopen(">$var::NMZ{'__pi'}");
2285    my $fh_tmp_p  = util::efopen(">$var::NMZ{'__p'}");
2286
2287    my $fh_phrase = util::fopen($var::NMZ{'_p'});
2288    my $fh_phraseindex = undef;
2289    if ($fh_phrase) {
2290	$fh_phraseindex = util::efopen($var::NMZ{'_pi'});
2291	$opened = 1;
2292    }
2293
2294    my $ptr = 0;
2295    for (my $i = 0; $i < 65536; $i++) {
2296
2297	my $baserecord = "";
2298	my $baseleng = 0;
2299
2300	if ($opened) {
2301	    my $tmp = 0;
2302	    read($fh_phraseindex, $tmp, $var::INTSIZE);
2303	    $tmp = unpack("N", $tmp);
2304	    if ($tmp != 0xffffffff) { # 0xffffffff
2305		$baseleng = readw($fh_phrase);
2306		read($fh_phrase, $baserecord, $baseleng);
2307	    }
2308	}
2309	if (defined($PhraseHash{$i})) {
2310	    if ($baserecord eq "") {
2311		print $fh_tmp_pi pack("N", $ptr);
2312		my $record = $PhraseHash{$i};
2313		my $n2 = length($record);
2314		my $data = pack("w", $n2) . $record;
2315		print $fh_tmp_p $data;
2316		$ptr += length($data);
2317	    } else {
2318		print $fh_tmp_pi pack("N", $ptr);
2319		my $record = $PhraseHash{$i};
2320		my $last_docid = get_last_docid($baserecord, 1);
2321		my $adjrecord = adjust_first_docid($record, $last_docid);
2322		check_records(\$record, \$baserecord, 1) unless defined $adjrecord; # namazu-bugs-ja#31
2323		$record = $adjrecord;
2324		my $n2 = length($record) + $baseleng;
2325		my $data = pack("w", $n2) .  $baserecord . $record;
2326		print $fh_tmp_p $data;
2327		$ptr += length($data);
2328	    }
2329	} else {
2330	    if ($baserecord eq "") {
2331		# if $baserecord has no data, set to 0xffffffff
2332		print $fh_tmp_pi pack("N", 0xffffffff);
2333	    } else {
2334		print $fh_tmp_pi pack("N", $ptr);
2335		my $data = pack("w", $baseleng) . $baserecord;
2336		print $fh_tmp_p $data;
2337		$ptr += length($data);
2338	    }
2339	}
2340    }
2341
2342    if ($opened) {
2343        util::fclose($fh_phraseindex);
2344    }
2345    if (defined $fh_phrase) {
2346        util::fclose($fh_phrase);
2347    }
2348    util::fclose($fh_tmp_p);
2349    util::fclose($fh_tmp_pi);
2350
2351    %PhraseHash = ();
2352    %PhraseHashLast = ();
2353}
2354
2355# Dr. Knuth's  ``hash'' from (UNIX MAGAZINE May 1998)
2356sub hash ($) {
2357    my ($word) = @_;
2358
2359    my $hash = 0;
2360    for (my $i = 0; $word ne ""; $i++) {
2361	$hash ^= $Seed[$i & 0x03][ord($word)];
2362        $word = substr $word, 1;
2363	# $word =~ s/^.//;  is slower
2364    }
2365    return $hash & 65535;
2366}
2367
2368# Count frequencies of words.
2369sub count_words ($$$$) {
2370    my ($docid_count, $docid_base, $contref, $kanji) = @_;
2371    my (@tmp);
2372
2373    # Normalize into small letter.
2374    $$contref =~ tr/A-Z/a-z/;
2375
2376    # Remove control char.
2377    $$contref =~ tr/\x00-\x08\x0b-\x0c\x0e-\x1a/ /;
2378
2379    # It corresponds to -j option of ChaSen.
2380    $$contref =~ s/^[ \t\f]+//gm;    # except "\r\n"
2381    $$contref =~ s/[ \t\f]+$//gm;    # except "\r\n"
2382    $$contref =~ s/([a-z])-\n([a-z])/$1$2/gsi;   # for hyphenation
2383    if (util::islang("ja")) {
2384        $$contref =~ s/([\x80-\xff])\n([\x80-\xff])/$1$2/gs;
2385        $$contref =~ s/(��|��)/$1\n/gs;
2386    }
2387    $$contref =~ s/\n+/\n/gs;
2388
2389    # Do wakatigaki if necessary.
2390    if (util::islang("ja")) {
2391	wakati::wakatize_japanese($contref) if $kanji;
2392    }
2393
2394    my $part1 = "";
2395    my $part2 = "";
2396    if ($$contref =~ /\x7f/) {
2397	$part1 = substr $$contref, 0, index($$contref, "\x7f");
2398	$part2 = substr $$contref, index($$contref, "\x7f");
2399#	$part1 = $PREMATCH;  # $& and friends are not efficient
2400#	$part2 = $MATCH . $POSTMATCH;
2401    } else {
2402	$part1 = $$contref;
2403	$part2 = "";
2404    }
2405
2406    # do scoring
2407    my %word_count = ();
2408    $part2 =~ s!\x7f *(\d+) *\x7f([^\x7f]*)\x7f */ *\d+ *\x7f!
2409	wordcount_sub($2, $1, \%word_count)!ge;
2410    wordcount_sub($part1, 1, \%word_count);
2411
2412    # Add them to whole index
2413    my $docid = $docid_count + $docid_base;
2414    for my $word (keys(%word_count)) {
2415	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2416	$KeyIndexLast{$word} = 0 unless defined $KeyIndexLast{$word};
2417	$KeyIndex{$word} .= pack("w2",
2418				 $docid - $KeyIndexLast{$word},
2419				 $word_count{$word});
2420	$KeyIndexLast{$word} = $docid;
2421    }
2422}
2423
2424#
2425# Count words and do score weighting
2426#
2427sub wordcount_sub ($$\%) {
2428    my ($text, $weight, $word_count) = @_;
2429
2430    # Remove all symbols when -K option is specified.
2431    $text =~ tr/\xa1-\xfea-z0-9/   /c if $var::Opt{'nosymbol'};
2432
2433    # Count frequencies of words in a current document.
2434    # Handle symbols as follows.
2435    #
2436    # tcp/ip      ->  tcp/ip,     tcp,      ip
2437    # (tcp/ip)    ->  (tcp/ip),   tcp/ip,   tcp, ip
2438    # ((tcpi/ip)) ->  ((tcp/ip)), (tcp/ip), tcp
2439    #
2440    # Don't do processing for nested symbols.
2441    # NOTE: When -K is specified, all symbols are already removed.
2442
2443    my @words = split /\s+/, $text;
2444    for my $word (@words) {
2445	next if ($word eq "" || length($word) > $conf::WORD_LENG_MAX);
2446	if ($var::Opt{'noedgesymbol'}) {
2447	    # remove symbols at both ends
2448	    $word =~ s/^[^\xa1-\xfea-z_0-9]*(.*?)[^\xa1-\xfea-z_0-9]*$/$1/g;
2449	}
2450	$word_count->{$word} = 0 unless defined($word_count->{$word});
2451	$word_count->{$word} += $weight;
2452	unless ($var::Opt{'nosymbol'}) {
2453	    if ($word =~ /^[^\xa1-\xfea-z_0-9](.+)[^\xa1-\xfea-z_0-9]$/) {
2454		$word_count->{$1} = 0 unless defined($word_count->{$1});
2455		$word_count->{$1} += $weight;
2456		next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2457	    } elsif ($word =~ /^[^\xa1-\xfea-z_0-9](.+)/) {
2458		$word_count->{$1} = 0 unless defined($word_count->{$1});
2459		$word_count->{$1} += $weight;
2460		next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2461	    } elsif ($word =~ /(.+)[^\xa1-\xfea-z_0-9]$/) {
2462		$word_count->{$1} = 0 unless defined($word_count->{$1});
2463		$word_count->{$1} += $weight;
2464		next unless $1 =~ /[^\xa1-\xfea-z_0-9]/;
2465	    }
2466	    my @words_ = split(/[^\xa1-\xfea-z_0-9]+/, $word)
2467		if $word =~ /[^\xa1-\xfea-z_0-9]/;
2468	    for my $tmp (@words_) {
2469		next if $tmp eq "";
2470		$word_count->{$tmp} = 0 unless defined($word_count->{$tmp});
2471		$word_count->{$tmp} += $weight;
2472	    }
2473	    @words_ = ();
2474	}
2475    }
2476    return "";
2477}
2478
2479# Construct NMZ.i and NMZ.ii file. this processing is rather complex.
2480sub write_index () {
2481    my $key_count = write_index_sub();
2482    util::Rename($var::NMZ{'__i'}, $var::NMZ{'_i'});
2483    util::Rename($var::NMZ{'__w'}, $var::NMZ{'_w'});
2484
2485    return $key_count;
2486}
2487
2488# readw: read one pack 'w' word.
2489# This code was contributed by <furukawa@tcp-ip.or.jp>.
2490sub readw ($) {
2491    my $fh = shift;
2492    my $ret = '';
2493    my $c;
2494
2495    while (read($fh, $c, 1)){
2496	$ret .= $c;
2497	last unless 0x80 & ord $c;
2498    }
2499    return unpack('w', $ret);
2500}
2501
2502sub get_last_docid ($$) {
2503    my ($record, $step) = @_;
2504    my (@data) = unpack 'w*', $record;
2505
2506    my $sum = 0;
2507    for (my $i = 0; $i < @data; $i += $step) {
2508	$sum += $data[$i];
2509    }
2510    my $leng = @data / $step;
2511    return $sum;
2512}
2513
2514sub adjust_first_docid ($$) {
2515    my ($record, $last_docid) = @_;
2516    my (@data) = unpack 'w*', $record;
2517
2518    $data[0] = $data[0] - $last_docid;
2519    return undef if ($data[0] < 0); # namazu-bug-ja#31
2520    $record = pack 'w*', @data;
2521    return $record;
2522}
2523
2524sub write_index_sub () {
2525    my @words = sort keys(%KeyIndex);
2526    return 0 if $#words == -1;
2527
2528    my $cnt = 0;
2529    my $ptr_i = 0;
2530    my $ptr_w = 0;
2531    my $key_count = 0;
2532    my $baserecord = "";
2533
2534    util::dprint(_("doing write_index() processing.\n"));
2535    my $fh_tmp_i  = util::efopen(">$var::NMZ{'__i'}");
2536    my $fh_tmp_w  = util::efopen(">$var::NMZ{'__w'}");
2537    my $fh_i      = util::fopen($var::NMZ{'_i'});
2538    my $fh_ii     = util::efopen(">$var::NMZ{'_ii'}");
2539    my $fh_w      = util::fopen($var::NMZ{'_w'});
2540    my $fh_wi = util::efopen(">$var::NMZ{'_wi'}");
2541
2542    if ($fh_w) {
2543      FOO:
2544	while (defined(my $line = <$fh_w>)) {
2545	    chop $line;
2546	    my $current_word = $line;
2547
2548	    my $baseleng = readw($fh_i);
2549	    read($fh_i, $baserecord, $baseleng);
2550
2551 	    for (; $cnt < @words; $cnt++) {
2552		last unless $words[$cnt] le $current_word;
2553		my $record = $KeyIndex{$words[$cnt]};
2554		my $leng = length($record);
2555
2556		if ($current_word eq $words[$cnt]) {
2557		    my $last_docid = get_last_docid($baserecord, 2);
2558		    my $adjrecord = adjust_first_docid($record, $last_docid);
2559		    check_records(\$record, \$baserecord, 2) unless defined $adjrecord; # namazu-bugs-ja#31
2560		    $record = $adjrecord;
2561		    $leng = length($record);  # re-measure
2562		    my $tmp = pack("w", $leng + $baseleng);
2563
2564		    my $data_i = "$tmp$baserecord$record";
2565		    my $data_w = "$current_word\n";
2566		    print $fh_tmp_i $data_i;
2567		    print $fh_tmp_w $data_w;
2568		    print $fh_ii pack("N", $ptr_i);
2569		    print $fh_wi pack("N", $ptr_w);
2570		    $ptr_i += length($data_i);
2571		    $ptr_w += length($data_w);
2572		    $key_count++;
2573
2574		    $cnt++;
2575		    next FOO;
2576		} else {
2577		    my $tmp = pack("w", $leng);
2578		    my $data_i = "$tmp$record";
2579		    my $data_w = "$words[$cnt]\n";
2580		    print $fh_tmp_i $data_i;
2581		    print $fh_tmp_w $data_w;
2582		    print $fh_ii pack("N", $ptr_i);
2583		    print $fh_wi pack("N", $ptr_w);
2584		    $ptr_i += length($data_i);
2585		    $ptr_w += length($data_w);
2586		    $key_count++;
2587		}
2588	    }
2589	    my $tmp  = pack("w", $baseleng);
2590	    my $data_i = "$tmp$baserecord";
2591	    my $data_w = "$current_word\n";
2592	    print $fh_tmp_i $data_i;
2593	    print $fh_tmp_w $data_w;
2594	    print $fh_ii pack("N", $ptr_i);
2595	    print $fh_wi pack("N", $ptr_w);
2596	    $ptr_i += length($data_i);
2597	    $ptr_w += length($data_w);
2598	    $key_count++;
2599	}
2600    }
2601    while ($cnt < @words) {
2602	my $leng = length($KeyIndex{$words[$cnt]});
2603	my $tmp = pack("w", $leng);
2604	my $record = $KeyIndex{$words[$cnt]};
2605
2606	my $data_i = "$tmp$record";
2607	my $data_w = "$words[$cnt]\n";
2608	print $fh_tmp_i $data_i;
2609	print $fh_tmp_w $data_w;
2610	print $fh_ii pack("N", $ptr_i);
2611	print $fh_wi pack("N", $ptr_w);
2612	$ptr_i += length($data_i);
2613	$ptr_w += length($data_w);
2614	$key_count++;
2615	$cnt++;
2616    }
2617    %KeyIndex = ();
2618    %KeyIndexLast = ();
2619
2620    util::fclose($fh_wi);
2621    util::fclose($fh_w) if (defined $fh_w);
2622    util::fclose($fh_ii);
2623    util::fclose($fh_i) if (defined $fh_i);
2624    util::fclose($fh_tmp_w);
2625    util::fclose($fh_tmp_i);
2626
2627    return $key_count;
2628}
2629
2630#
2631# Decide the media type.
2632# FIXME: Very ad hoc. It's just a compromise. -- satoru
2633#
2634sub decide_type ($$) {
2635    my ($name, $cont) = @_;
2636    return $name if (!defined $cont || $name eq $cont);
2637
2638    util::dprint("decide_type: name: $name, cont: $cont\n");
2639    if ($cont =~ m!^text/plain! && $name =~ m!^text/plain!) {
2640	return $name;
2641    } elsif ($cont =~ m!^application/octet-stream! &&
2642             $name !~ m!^text/!) {
2643	return $name;
2644    } elsif ($cont =~ m!^application/(excel|powerpoint|msword)! &&
2645	     $name !~ m!^application/octet-stream!)  {
2646	# FIXME: Currently File::MMagic 1.02's checktype_data()
2647	# is unreliable for them.
2648	return $name;
2649    } elsif ($cont =~ m!^application/x-zip! &&
2650             $name =~ m!^application/!) {
2651        # zip format is used other applications e.g. OpenOffice.
2652        # It is necessary to add to check extention.
2653        return $name;
2654    }
2655
2656    return $cont;
2657}
2658
2659#
2660# Debugging code for the "negative numbers" problem.
2661#
2662sub check_records ($$$) {
2663    my ($recref, $baserecref, $step) = @_;
2664    dump_record($baserecref, $step);
2665    dump_record($recref, $step);
2666    print STDERR "The \x22negative number\x22 problem occurred.\n";
2667    exit(1);
2668}
2669
2670sub dump_record($$) {
2671    my ($recref, $step) = @_;
2672    my (@data) = unpack 'w*', $$recref;
2673    print STDERR "dump record data to NMZ.bug.info (step: $step)...";
2674    my $fh_info = util::fopen(">> NMZ.bug.info");
2675    print $fh_info "dumped record data (step: $step)...";
2676    foreach (@data) {
2677	print $fh_info sprintf(" %08x", $_);
2678    }
2679    print $fh_info "\n";
2680    util::fclose($fh_info);
2681    return;
2682}
2683
2684sub trapintr {
2685    my ($signame) = @_;
2686    print STDERR "Warning: signal $signame occured.\n";
2687}
2688
2689#
2690# For avoiding "used only once: possible typo at ..." warnings.
2691#
2692muda($conf::ON_MEMORY_MAX,
2693     $conf::WORD_LENG_MAX, $conf::TEXT_SIZE_MAX,
2694     $conf::DENY_FILE, $var::INTSIZE,
2695     $conf::CHASEN_NOUN, $conf::CHASEN,
2696     $conf::KAKASI, $var::Opt{'okurigana'},
2697     $var::Opt{'hiragana'}, $conf::DIRECTORY_INDEX,
2698     $usage::USAGE, $var::Opt{'noheadabst'}, $usage::VERSION_INFO,
2699     $var::Opt{'noencodeurl'}, $conf::HTML_SUFFIX,
2700     $var::RECURSIVE_ACTIONS, $conf::META_TAGS, $var::USE_NKF_MODULE,
2701     $conf::ADDRESS, $var::MAILING_ADDRESS,
2702     $conf::FILE_SIZE_MAX,
2703     $conf::MECAB,
2704     $conf::DENY_DDN,
2705     $var::TRAC_URI,
2706     );
2707
2708sub muda {}
2709
2710