1use strict; 2use warnings; 3 4package KinoSearch1::Test::TestUtils; 5use base qw( Exporter ); 6 7our @EXPORT_OK = qw( 8 working_dir 9 create_working_dir 10 remove_working_dir 11 create_index 12 create_persistent_test_index 13 test_index_loc 14 persistent_test_index_loc 15 init_test_index_loc 16 get_uscon_docs 17 utf8_test_strings 18 test_analyzer 19); 20 21use KinoSearch1::InvIndexer; 22use KinoSearch1::Store::RAMInvIndex; 23use KinoSearch1::Store::FSInvIndex; 24use KinoSearch1::Analysis::Tokenizer; 25use KinoSearch1::Analysis::TokenBatch; 26use KinoSearch1::Analysis::PolyAnalyzer; 27 28use File::Spec::Functions qw( catdir catfile curdir ); 29use Encode qw( _utf8_off ); 30use File::Path qw( rmtree ); 31use Carp; 32 33my $working_dir = catfile( curdir(), 'kinosearch_test' ); 34 35# Return a directory within the system's temp directory where we will put all 36# testing scratch files. 37sub working_dir {$working_dir} 38 39sub create_working_dir { 40 mkdir( $working_dir, 0700 ) or die "Can't mkdir '$working_dir': $!"; 41} 42 43# Verify that this user owns the working dir, then zap it. Returns true upon 44# success. 45sub remove_working_dir { 46 return unless -d $working_dir; 47 rmtree $working_dir; 48 return 1; 49} 50 51# Return a location for a test index to be used by a single test file. If 52# the test file crashes it cannot clean up after itself, so we put the cleanup 53# routine in a single test file to be run at or near the end of the test 54# suite. 55sub test_index_loc { 56 return catdir( $working_dir, 'test_index' ); 57} 58 59# Return a location for a test index intended to be shared by multiple test 60# files. It will be cleaned as above. 61sub persistent_test_index_loc { 62 return catdir( $working_dir, 'persistent_test_index' ); 63} 64 65# Destroy anything left over in the test_index location, then create the 66# directory. Finally, return the path. 67sub init_test_index_loc { 68 my $dir = test_index_loc(); 69 rmtree $dir; 70 die "Can't clean up '$dir'" if -e $dir; 71 mkdir $dir or die "Can't mkdir '$dir': $!"; 72 return $dir; 73} 74 75# Build a RAM index, using the supplied array of strings as source material. 76# The index will have a single field: "content". 77sub create_index { 78 my @docs = @_; 79 80 my $tokenizer = KinoSearch1::Analysis::Tokenizer->new; 81 my $invindex = KinoSearch1::Store::RAMInvIndex->new; 82 my $invindexer = KinoSearch1::InvIndexer->new( 83 invindex => $invindex, 84 analyzer => $tokenizer, 85 create => 1, 86 ); 87 88 $invindexer->spec_field( name => 'content' ); 89 90 for (@docs) { 91 my $doc = $invindexer->new_doc; 92 $doc->set_value( content => $_ ); 93 $invindexer->add_doc($doc); 94 } 95 96 $invindexer->finish; 97 98 return $invindex; 99} 100 101# Slurp us constitition docs and build hashrefs. 102sub get_uscon_docs { 103 104 my $uscon_dir = catdir( 't', 'us_constitution' ); 105 opendir( my $uscon_dh, $uscon_dir ) 106 or die "couldn't opendir '$uscon_dir': $!"; 107 my @filenames = grep {/\.html$/} sort readdir $uscon_dh; 108 closedir $uscon_dh or die "couldn't closedir '$uscon_dir': $!"; 109 110 my %docs; 111 112 for my $filename (@filenames) { 113 next if $filename eq 'index.html'; 114 my $filepath = catfile( $uscon_dir, $filename ); 115 open( my $fh, '<', $filepath ) 116 or die "couldn't open file '$filepath': $!"; 117 my $content = do { local $/; <$fh> }; 118 $content =~ m#<title>(.*?)</title>#s 119 or die "couldn't isolate title in '$filepath'"; 120 my $title = $1; 121 $content =~ m#<div id="bodytext">(.*?)</div><!--bodytext-->#s 122 or die "couldn't isolate bodytext in '$filepath'"; 123 my $bodytext = $1; 124 $bodytext =~ s/<.*?>//sg; 125 $bodytext =~ s/\s+/ /sg; 126 127 $docs{$filename} = { 128 title => $title, 129 bodytext => $bodytext, 130 url => "/us_constitution/$filename", 131 }; 132 } 133 134 return \%docs; 135} 136 137sub create_persistent_test_index { 138 my $invindexer; 139 my $polyanalyzer 140 = KinoSearch1::Analysis::PolyAnalyzer->new( language => 'en' ); 141 142 $invindexer = KinoSearch1::InvIndexer->new( 143 invindex => persistent_test_index_loc(), 144 create => 1, 145 analyzer => $polyanalyzer, 146 ); 147 $invindexer->spec_field( name => 'content' ); 148 for ( 0 .. 10000 ) { 149 my $doc = $invindexer->new_doc; 150 $doc->set_value( content => "zz$_" ); 151 $invindexer->add_doc($doc); 152 } 153 $invindexer->finish; 154 undef $invindexer; 155 156 $invindexer = KinoSearch1::InvIndexer->new( 157 invindex => persistent_test_index_loc(), 158 analyzer => $polyanalyzer, 159 ); 160 $invindexer->spec_field( name => 'content' ); 161 my $source_docs = get_uscon_docs(); 162 for ( values %$source_docs ) { 163 my $doc = $invindexer->new_doc; 164 $doc->set_value( content => $_->{bodytext} ); 165 $invindexer->add_doc($doc); 166 } 167 $invindexer->finish; 168 undef $invindexer; 169 170 $invindexer = KinoSearch1::InvIndexer->new( 171 invindex => persistent_test_index_loc(), 172 analyzer => $polyanalyzer, 173 ); 174 $invindexer->spec_field( name => 'content' ); 175 my @chars = ( 'a' .. 'z' ); 176 for ( 0 .. 1000 ) { 177 my $content = ''; 178 for my $num_words ( 1 .. int( rand(20) ) ) { 179 for ( 1 .. ( int( rand(10) ) + 10 ) ) { 180 $content .= @chars[ rand(@chars) ]; 181 } 182 $content .= ' '; 183 } 184 my $doc = $invindexer->new_doc; 185 $doc->set_value( content => $content ); 186 $invindexer->add_doc($doc); 187 } 188 $invindexer->finish( optimize => 1 ); 189} 190 191# Return 3 strings useful for verifying UTF-8 integrity. 192sub utf8_test_strings { 193 my $smiley = "\x{263a}"; 194 my $not_a_smiley = $smiley; 195 _utf8_off($not_a_smiley); 196 my $frowny = $not_a_smiley; 197 utf8::upgrade($frowny); 198 return ( $smiley, $not_a_smiley, $frowny ); 199} 200 201# Verify an Analyzer's analyze() method. 202sub test_analyzer { 203 my ( $analyzer, $source, $expected, $message ) = @_; 204 205 my $batch = KinoSearch1::Analysis::TokenBatch->new; 206 $batch->append( $source, 0, length($source) ); 207 208 $batch = $analyzer->analyze($batch); 209 my @got; 210 while ( $batch->next ) { 211 push @got, $batch->get_text; 212 } 213 Test::More::is_deeply( \@got, $expected, "analyze: $message" ); 214} 215 2161; 217 218__END__ 219 220__COPYRIGHT__ 221 222Copyright 2005-2010 Marvin Humphrey 223 224This program is free software; you can redistribute it and/or modify 225under the same terms as Perl itself. 226 227