package KinoSearch1::Document::Field;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Util::Class );

# Declare the per-instance members and their defaults, then request accessors
# for the listed members.  (Presumably ready_get_set generates get_X/set_X
# methods in KinoSearch1::Util::Class -- confirm there.)  fnm_bits and
# fdt_bits are intentionally absent from the accessor list: their accessors
# are written by hand further down because the getters compute lazily.
BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor args / members
        name       => undef,
        analyzer   => undef,
        boost      => 1,
        stored     => 1,
        indexed    => 1,
        analyzed   => 1,
        vectorized => 1,
        binary     => 0,
        compressed => 0,
        omit_norms => 0,
        field_num  => undef,
        value      => '',
        fnm_bits   => undef,
        fdt_bits   => undef,
        tv_string  => '',
        tv_cache   => undef,
    );
    __PACKAGE__->ready_get_set(
        qw(
            value
            tv_string
            boost
            indexed
            stored
            analyzed
            vectorized
            binary
            compressed
            analyzer
            field_num
            name
            omit_norms
            )
    );
}

use KinoSearch1::Index::FieldsReader;
use KinoSearch1::Index::FieldInfos;
use KinoSearch1::Index::TermVector;

use Storable qw( dclone );

# Validate and normalize a freshly populated Field object.
#
# Croaks if 'name' is missing or empty.  When the 'binary' flag is set,
# 'indexed' and 'analyzed' are forced to 0, whatever the caller supplied.
sub init_instance {
    my $self = shift;

    # field name is required ('name' defaults to undef, so test definedness
    # before length to avoid an "uninitialized" warning on older perls)
    croak("Missing required parameter 'name'")
        unless defined( $self->{name} ) && length( $self->{name} );

    # don't index binary fields
    if ( $self->{binary} ) {
        $self->{indexed}  = 0;
        $self->{analyzed} = 0;
    }
}

# Return a deep copy of this Field, made with Storable's dclone.
sub clone {
    my $self = shift;
    return dclone($self);
}

# Given two Field objects, return a child which has all the positive
# attributes of both parents (meaning: values are OR'd).
sub breed_with {
    my ( $self, $other ) = @_;

    # Start from a deep copy of the invocant; neither parent is modified.
    my $kid = $self->clone;

    # Only these two flags are merged: the child has each one set if either
    # parent does.  Every other member comes from $self unchanged.
    for my $attr (qw( indexed vectorized )) {
        $kid->{$attr} ||= $other->{$attr};
    }
    return $kid;
}

# Explicitly set the cached fnm bits.
sub set_fnm_bits { $_[0]->{fnm_bits} = $_[1] }

# Return the fnm bits for this field.  If none have been set, they are
# computed via KinoSearch1::Index::FieldInfos->encode_fnm_bits and cached.
sub get_fnm_bits {
    my $self = shift;
    $self->{fnm_bits} = KinoSearch1::Index::FieldInfos->encode_fnm_bits($self)
        unless defined $self->{fnm_bits};
    return $self->{fnm_bits};
}

# Explicitly set the cached fdt bits.
sub set_fdt_bits { $_[0]->{fdt_bits} = $_[1] }

# Return the fdt bits for this field.  If none have been set, they are
# computed via KinoSearch1::Index::FieldsReader->encode_fdt_bits and cached.
sub get_fdt_bits {
    my $self = shift;
    $self->{fdt_bits}
        = KinoSearch1::Index::FieldsReader->encode_fdt_bits($self)
        unless defined $self->{fdt_bits};
    return $self->{fdt_bits};
}

# Return the length of the field's value in bytes (not characters).
sub get_value_len { bytes::length( $_[0]->{value} ) }

# Return a TermVector object for a given Term, if it's in this field.
#
# Returns nothing (empty list / undef) when the field has no term vector
# string or when the term text is not present in it.  The term vector string
# is decoded into a { term_text => packed_posdata } hash on first use and the
# hash is kept in tv_cache for later calls.
sub term_vector {
    my ( $self, $term_text ) = @_;

    # nothing to look up if no term vector data was supplied
    return unless bytes::length( $self->{tv_string} );

    # decode the term vector string once, lazily
    if ( !defined $self->{tv_cache} ) {
        $self->{tv_cache} = _extract_tv_cache( $self->{tv_string} );
    }

    return unless exists $self->{tv_cache}{$term_text};

    # positional data stays packed in the cache; expand it only on request
    my ( $positions, $starts, $ends )
        = _unpack_posdata( $self->{tv_cache}{$term_text} );

    return KinoSearch1::Index::TermVector->new(
        text          => $term_text,
        field         => $self->{name},
        positions     => $positions,
        start_offsets => $starts,
        end_offsets   => $ends,
    );
}

1;

__END__

__XS__

MODULE = KinoSearch1    PACKAGE = KinoSearch1::Document::Field

=for comment

Return ref to a hash where the keys are term texts and the values are encoded
positional data.

=cut

void
_extract_tv_cache(tv_string_sv)
    SV *tv_string_sv;
PREINIT:
    HV *tv_cache_hv;
PPCODE:
    tv_cache_hv = Kino1_Field_extract_tv_cache(tv_string_sv);
    XPUSHs( sv_2mortal( newRV_noinc( (SV*)tv_cache_hv ) ) );
    XSRETURN(1);

=for comment

Decompress positional data.
154 155=cut 156 157void 158_unpack_posdata(posdata_sv) 159 SV *posdata_sv; 160PREINIT: 161 AV *positions_av, *starts_av, *ends_av; 162PPCODE: 163 positions_av = newAV(); 164 starts_av = newAV(); 165 ends_av = newAV(); 166 Kino1_Field_unpack_posdata(posdata_sv, positions_av, starts_av, ends_av); 167 XPUSHs(sv_2mortal( newRV_noinc((SV*)positions_av) )); 168 XPUSHs(sv_2mortal( newRV_noinc((SV*)starts_av) )); 169 XPUSHs(sv_2mortal( newRV_noinc((SV*)ends_av) )); 170 XSRETURN(3); 171 172 173__H__ 174 175#ifndef H_KINOSEARCH_FIELD 176#define H_KINOSEARCH_FIELD 1 177 178#include "EXTERN.h" 179#include "perl.h" 180#include "XSUB.h" 181#include "KinoSearch1StoreInStream.h" 182#include "KinoSearch1UtilCarp.h" 183 184HV* Kino1_Field_extract_tv_cache(SV*); 185void Kino1_Field_unpack_posdata(SV*, AV*, AV*, AV*); 186 187#endif /* include guard */ 188 189__C__ 190 191#include "KinoSearch1DocumentField.h" 192 193HV* 194Kino1_Field_extract_tv_cache(SV *tv_string_sv) { 195 HV *tv_cache_hv; 196 char *tv_string, *bookmark_ptr, *key; 197 char **tv_ptr; 198 STRLEN len, tv_len, overlap, key_len; 199 SV *text_sv, *nums_sv; 200 I32 i, num_terms, num_positions; 201 202 /* allocate a new hash */ 203 tv_cache_hv = newHV(); 204 205 /* extract pointers */ 206 tv_string = SvPV(tv_string_sv, tv_len); 207 tv_ptr = &tv_string; 208 209 /* create a base text scalar */ 210 text_sv = newSV(1); 211 SvPOK_on(text_sv); 212 *(SvEND(text_sv)) = '\0'; 213 214 /* read the number of vectorized terms in the field */ 215 num_terms = Kino1_InStream_decode_vint(tv_ptr); 216 for (i = 0; i < num_terms; i++) { 217 218 /* decompress the term text */ 219 overlap = Kino1_InStream_decode_vint(tv_ptr); 220 SvCUR_set(text_sv, overlap); 221 len = Kino1_InStream_decode_vint(tv_ptr); 222 sv_catpvn(text_sv, *tv_ptr, len); 223 *tv_ptr += len; 224 key = SvPV(text_sv, key_len); 225 226 /* get positions & offsets string */ 227 num_positions = Kino1_InStream_decode_vint(tv_ptr); 228 bookmark_ptr = *tv_ptr; 229 while(num_positions--) 
{ 230 /* leave nums compressed to save a little mem */ 231 (void)Kino1_InStream_decode_vint(tv_ptr); 232 (void)Kino1_InStream_decode_vint(tv_ptr); 233 (void)Kino1_InStream_decode_vint(tv_ptr); 234 } 235 len = *tv_ptr - bookmark_ptr; 236 nums_sv = newSVpvn(bookmark_ptr, len); 237 238 /* store the $text => $posdata pair in the output hash */ 239 hv_store(tv_cache_hv, key, key_len, nums_sv, 0); 240 } 241 SvREFCNT_dec(text_sv); 242 243 return tv_cache_hv; 244} 245 246void 247Kino1_Field_unpack_posdata(SV *posdata_sv, AV *positions_av, 248 AV *starts_av, AV *ends_av) { 249 STRLEN len; 250 char *posdata, *posdata_end; 251 char **posdata_ptr; 252 SV *num_sv; 253 posdata = SvPV(posdata_sv, len); 254 posdata_ptr = &posdata; 255 posdata_end = SvEND(posdata_sv); 256 257 /* translate encoded VInts to Perl scalars */ 258 while(*posdata_ptr < posdata_end) { 259 num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) ); 260 av_push(positions_av, num_sv); 261 num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) ); 262 av_push(starts_av, num_sv); 263 num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) ); 264 av_push(ends_av, num_sv); 265 } 266 267 if (*posdata_ptr != posdata_end) 268 Kino1_confess("Bad encoding of posdata"); 269} 270 271__POD__ 272 273=head1 NAME 274 275KinoSearch1::Document::Field - a field within a document 276 277=head1 SYNOPSIS 278 279 # no public interface 280 281=head1 DESCRIPTION 282 283Fields can only be defined or manipulated indirectly, via InvIndexer and Doc. 284 285=head1 COPYRIGHT 286 287Copyright 2005-2010 Marvin Humphrey 288 289=head1 LICENSE, DISCLAIMER, BUGS, etc. 290 291See L<KinoSearch1> version 1.01. 292 293=cut 294 295 296