1use strict;
2use warnings;
3
4package KinoSearch1::Test::TestUtils;
5use base qw( Exporter );
6
7our @EXPORT_OK = qw(
8    working_dir
9    create_working_dir
10    remove_working_dir
11    create_index
12    create_persistent_test_index
13    test_index_loc
14    persistent_test_index_loc
15    init_test_index_loc
16    get_uscon_docs
17    utf8_test_strings
18    test_analyzer
19);
20
21use KinoSearch1::InvIndexer;
22use KinoSearch1::Store::RAMInvIndex;
23use KinoSearch1::Store::FSInvIndex;
24use KinoSearch1::Analysis::Tokenizer;
25use KinoSearch1::Analysis::TokenBatch;
26use KinoSearch1::Analysis::PolyAnalyzer;
27
28use File::Spec::Functions qw( catdir catfile curdir );
29use Encode qw( _utf8_off );
30use File::Path qw( rmtree );
31use Carp;
32
33my $working_dir = catfile( curdir(), 'kinosearch_test' );
34
35# Return a directory within the system's temp directory where we will put all
36# testing scratch files.
37sub working_dir {$working_dir}
38
39sub create_working_dir {
40    mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!";
41}
42
43# Verify that this user owns the working dir, then zap it.  Returns true upon
44# success.
45sub remove_working_dir {
46    return unless -d $working_dir;
47    rmtree $working_dir;
48    return 1;
49}
50
51# Return a location for a test index to be used by a single test file.  If
52# the test file crashes it cannot clean up after itself, so we put the cleanup
53# routine in a single test file to be run at or near the end of the test
54# suite.
55sub test_index_loc {
56    return catdir( $working_dir, 'test_index' );
57}
58
59# Return a location for a test index intended to be shared by multiple test
60# files.  It will be cleaned as above.
61sub persistent_test_index_loc {
62    return catdir( $working_dir, 'persistent_test_index' );
63}
64
65# Destroy anything left over in the test_index location, then create the
66# directory.  Finally, return the path.
67sub init_test_index_loc {
68    my $dir = test_index_loc();
69    rmtree $dir;
70    die "Can't clean up '$dir'" if -e $dir;
71    mkdir $dir or die "Can't mkdir '$dir': $!";
72    return $dir;
73}
74
75# Build a RAM index, using the supplied array of strings as source material.
76# The index will have a single field: "content".
77sub create_index {
78    my @docs = @_;
79
80    my $tokenizer  = KinoSearch1::Analysis::Tokenizer->new;
81    my $invindex   = KinoSearch1::Store::RAMInvIndex->new;
82    my $invindexer = KinoSearch1::InvIndexer->new(
83        invindex => $invindex,
84        analyzer => $tokenizer,
85        create   => 1,
86    );
87
88    $invindexer->spec_field( name => 'content' );
89
90    for (@docs) {
91        my $doc = $invindexer->new_doc;
92        $doc->set_value( content => $_ );
93        $invindexer->add_doc($doc);
94    }
95
96    $invindexer->finish;
97
98    return $invindex;
99}
100
101# Slurp us constitition docs and build hashrefs.
102sub get_uscon_docs {
103
104    my $uscon_dir = catdir( 't', 'us_constitution' );
105    opendir( my $uscon_dh, $uscon_dir )
106        or die "couldn't opendir '$uscon_dir': $!";
107    my @filenames = grep {/\.html$/} sort readdir $uscon_dh;
108    closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!";
109
110    my %docs;
111
112    for my $filename (@filenames) {
113        next if $filename eq 'index.html';
114        my $filepath = catfile( $uscon_dir, $filename );
115        open( my $fh, '<', $filepath )
116            or die "couldn't open file '$filepath': $!";
117        my $content = do { local $/; <$fh> };
118        $content =~ m#<title>(.*?)</title>#s
119            or die "couldn't isolate title in '$filepath'";
120        my $title = $1;
121        $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s
122            or die "couldn't isolate bodytext in '$filepath'";
123        my $bodytext = $1;
124        $bodytext =~ s/<.*?>//sg;
125        $bodytext =~ s/\s+/ /sg;
126
127        $docs{$filename} = {
128            title    => $title,
129            bodytext => $bodytext,
130            url      => "/us_constitution/$filename",
131        };
132    }
133
134    return \%docs;
135}
136
137sub create_persistent_test_index {
138    my $invindexer;
139    my $polyanalyzer
140        = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' );
141
142    $invindexer = KinoSearch1::InvIndexer->new(
143        invindex => persistent_test_index_loc(),
144        create   => 1,
145        analyzer => $polyanalyzer,
146    );
147    $invindexer->spec_field( name => 'content' );
148    for ( 0 .. 10000 ) {
149        my $doc = $invindexer->new_doc;
150        $doc->set_value( content => "zz$_" );
151        $invindexer->add_doc($doc);
152    }
153    $invindexer->finish;
154    undef $invindexer;
155
156    $invindexer = KinoSearch1::InvIndexer->new(
157        invindex => persistent_test_index_loc(),
158        analyzer => $polyanalyzer,
159    );
160    $invindexer->spec_field( name => 'content' );
161    my $source_docs = get_uscon_docs();
162    for ( values %$source_docs ) {
163        my $doc = $invindexer->new_doc;
164        $doc->set_value( content => $_->{bodytext} );
165        $invindexer->add_doc($doc);
166    }
167    $invindexer->finish;
168    undef $invindexer;
169
170    $invindexer = KinoSearch1::InvIndexer->new(
171        invindex => persistent_test_index_loc(),
172        analyzer => $polyanalyzer,
173    );
174    $invindexer->spec_field( name => 'content' );
175    my @chars = ( 'a' .. 'z' );
176    for ( 0 .. 1000 ) {
177        my $content = '';
178        for my $num_words ( 1 .. int( rand(20) ) ) {
179            for ( 1 .. ( int( rand(10) ) + 10 ) ) {
180                $content .= @chars[ rand(@chars) ];
181            }
182            $content .= ' ';
183        }
184        my $doc = $invindexer->new_doc;
185        $doc->set_value( content => $content );
186        $invindexer->add_doc($doc);
187    }
188    $invindexer->finish( optimize => 1 );
189}
190
191# Return 3 strings useful for verifying UTF-8 integrity.
192sub utf8_test_strings {
193    my $smiley       = "\x{263a}";
194    my $not_a_smiley = $smiley;
195    _utf8_off($not_a_smiley);
196    my $frowny = $not_a_smiley;
197    utf8::upgrade($frowny);
198    return ( $smiley, $not_a_smiley, $frowny );
199}
200
201# Verify an Analyzer's analyze() method.
202sub test_analyzer {
203    my ( $analyzer, $source, $expected, $message ) = @_;
204
205    my $batch = KinoSearch1::Analysis::TokenBatch->new;
206    $batch->append( $source, 0, length($source) );
207
208    $batch = $analyzer->analyze($batch);
209    my @got;
210    while ( $batch->next ) {
211        push @got, $batch->get_text;
212    }
213    Test::More::is_deeply( \@got, $expected, "analyze: $message" );
214}
215
2161;
217
218__END__
219
220__COPYRIGHT__
221
222Copyright 2005-2010 Marvin Humphrey
223
224This program is free software; you can redistribute it and/or modify
225under the same terms as Perl itself.
226
227