1package KinoSearch1::Document::Field;
2use strict;
3use warnings;
4use KinoSearch1::Util::ToolSet;
5use base qw( KinoSearch1::Util::Class );
6
BEGIN {
    # Constructor args / members, each paired with its default value.  Held
    # in an array (not a hash) so the pairs reach init_instance_vars() in
    # exactly the order written here.
    my @member_defaults = (
        name       => undef,
        analyzer   => undef,
        boost      => 1,
        stored     => 1,
        indexed    => 1,
        analyzed   => 1,
        vectorized => 1,
        binary     => 0,
        compressed => 0,
        omit_norms => 0,
        field_num  => undef,
        value      => '',
        fnm_bits   => undef,
        fdt_bits   => undef,
        tv_string  => '',
        tv_cache   => undef,
    );

    # Members handed to ready_get_set().  fnm_bits and fdt_bits are left
    # out because their accessors are written by hand later in this file;
    # tv_cache is left out as well.
    my @accessor_names = qw(
        value
        tv_string
        boost
        indexed
        stored
        analyzed
        vectorized
        binary
        compressed
        analyzer
        field_num
        name
        omit_norms
    );

    __PACKAGE__->init_instance_vars(@member_defaults);
    __PACKAGE__->ready_get_set(@accessor_names);
}
45
46use KinoSearch1::Index::FieldsReader;
47use KinoSearch1::Index::FieldInfos;
48use KinoSearch1::Index::TermVector;
49
50use Storable qw( dclone );
51
# Validate and normalize this object's settings.
# (Presumably invoked by the KinoSearch1::Util::Class constructor -- the
# caller is not visible in this file.)
#
# Croaks with "Missing required parameter 'name'" when the name is undef or
# the empty string.  When the "binary" flag is true, forces "indexed" and
# "analyzed" to 0.
sub init_instance {
    my $self = shift;

    # field name is required.  Check definedness before length so that a
    # missing name never produces an "uninitialized value" warning on perls
    # where length(undef) warns (before 5.12).
    croak("Missing required parameter 'name'")
        unless defined( $self->{name} ) && length( $self->{name} );

    # don't index binary fields
    if ( $self->{binary} ) {
        $self->{indexed}  = 0;
        $self->{analyzed} = 0;
    }
}
65
# Return an independent deep copy of this object, made with Storable's
# dclone().
sub clone {
    my ($self) = @_;
    my $duplicate = dclone($self);
    return $duplicate;
}
70
# Given two Field objects, return a child cloned from the invocant whose
# "indexed" and "vectorized" flags are the logical OR of both parents'
# flags.  Every other attribute is inherited from the invocant unchanged.
sub breed_with {
    my ( $self, $other ) = @_;
    my $offspring = $self->clone;
    for my $flag (qw( indexed vectorized )) {
        # a flag that is already true on the child stays as it is
        next if $offspring->{$flag};
        $offspring->{$flag} = $other->{$flag};
    }
    return $offspring;
}
81
# Overwrite the cached fnm_bits member; returns the value just assigned.
sub set_fnm_bits {
    my ( $self, $bits ) = @_;
    return $self->{fnm_bits} = $bits;
}
83
# Return fnm_bits.  If none are cached yet, obtain them from
# KinoSearch1::Index::FieldInfos->encode_fnm_bits($self) and remember the
# result for later calls.
sub get_fnm_bits {
    my ($self) = @_;
    if ( !defined $self->{fnm_bits} ) {
        $self->{fnm_bits}
            = KinoSearch1::Index::FieldInfos->encode_fnm_bits($self);
    }
    return $self->{fnm_bits};
}
90
# Overwrite the cached fdt_bits member; returns the value just assigned.
sub set_fdt_bits {
    my ( $self, $bits ) = @_;
    return $self->{fdt_bits} = $bits;
}
92
# Return fdt_bits.  If none are cached yet, obtain them from
# KinoSearch1::Index::FieldsReader->encode_fdt_bits($self) and remember the
# result for later calls.
sub get_fdt_bits {
    my ($self) = @_;
    if ( !defined $self->{fdt_bits} ) {
        $self->{fdt_bits}
            = KinoSearch1::Index::FieldsReader->encode_fdt_bits($self);
    }
    return $self->{fdt_bits};
}
100
# Return the length of the "value" member measured in bytes (via
# bytes::length), not in characters.
sub get_value_len {
    my ($self) = @_;
    return bytes::length( $self->{value} );
}
102
# Return a TermVector object for a given term text, if that term occurs in
# this field.  Returns nothing (empty list / undef) when the field has an
# empty tv_string or when the term is not present.
sub term_vector {
    my ( $self, $term_text ) = @_;

    # no term vector data stored for this field
    return unless bytes::length( $self->{tv_string} );

    # decode tv_string into a { term_text => posdata } hash on first use
    $self->{tv_cache} = _extract_tv_cache( $self->{tv_string} )
        unless defined $self->{tv_cache};

    my $cache = $self->{tv_cache};
    return unless exists $cache->{$term_text};

    # expand this term's encoded posdata into three array refs
    my ( $positions, $start_offsets, $end_offsets )
        = _unpack_posdata( $cache->{$term_text} );

    my $found = KinoSearch1::Index::TermVector->new(
        text          => $term_text,
        field         => $self->{name},
        positions     => $positions,
        start_offsets => $start_offsets,
        end_offsets   => $end_offsets,
    );
    return $found;
}
125
1261;
127
128__END__
129
130__XS__
131
132MODULE = KinoSearch1    PACKAGE = KinoSearch1::Document::Field
133
=for comment

Decode a serialized term vector string.  Returns a reference to a hash whose
keys are term texts and whose values are the still-encoded positional data
for each term.

=cut

void
_extract_tv_cache(tv_string_sv)
    SV *tv_string_sv;
PPCODE:
    {
        HV *tv_cache_hv  = Kino1_Field_extract_tv_cache(tv_string_sv);
        SV *tv_cache_ref = sv_2mortal( newRV_noinc( (SV*)tv_cache_hv ) );
        /* exactly one return value: the mortal hash reference, which
         * takes over ownership of the freshly built hash */
        EXTEND(SP, 1);
        PUSHs(tv_cache_ref);
        XSRETURN(1);
    }
150
=for comment

Decompress positional data.  Returns three array references, in this order:
positions, start offsets, end offsets.  Dies (via Kino1_confess in the
decoder) if the data is badly encoded.

=cut

void
_unpack_posdata(posdata_sv)
    SV *posdata_sv;
PREINIT:
    AV     *positions_av, *starts_av, *ends_av;
    SV     *positions_ref, *starts_ref, *ends_ref;
PPCODE:
    positions_av = newAV();
    starts_av    = newAV();
    ends_av      = newAV();
    /* Hand the arrays over to mortal references BEFORE decoding.  If
     * Kino1_Field_unpack_posdata() confesses on bad data, the mortals are
     * still released during unwinding, so the arrays are freed instead of
     * leaking. */
    positions_ref = sv_2mortal( newRV_noinc((SV*)positions_av) );
    starts_ref    = sv_2mortal( newRV_noinc((SV*)starts_av)    );
    ends_ref      = sv_2mortal( newRV_noinc((SV*)ends_av)      );
    Kino1_Field_unpack_posdata(posdata_sv, positions_av, starts_av, ends_av);
    XPUSHs(positions_ref);
    XPUSHs(starts_ref);
    XPUSHs(ends_ref);
    XSRETURN(3);
171
172
173__H__
174
175#ifndef H_KINOSEARCH_FIELD
176#define H_KINOSEARCH_FIELD 1
177
178#include "EXTERN.h"
179#include "perl.h"
180#include "XSUB.h"
181#include "KinoSearch1StoreInStream.h"
182#include "KinoSearch1UtilCarp.h"
183
184HV*  Kino1_Field_extract_tv_cache(SV*);
185void Kino1_Field_unpack_posdata(SV*, AV*, AV*, AV*);
186
187#endif /* include guard */
188
189__C__
190
191#include "KinoSearch1DocumentField.h"
192
193HV*
194Kino1_Field_extract_tv_cache(SV *tv_string_sv) {
195    HV *tv_cache_hv;
196    char    *tv_string, *bookmark_ptr, *key;
197    char   **tv_ptr;
198    STRLEN   len, tv_len, overlap, key_len;
199    SV      *text_sv, *nums_sv;
200    I32      i, num_terms, num_positions;
201
202    /* allocate a new hash */
203    tv_cache_hv = newHV();
204
205    /* extract pointers */
206    tv_string = SvPV(tv_string_sv, tv_len);
207    tv_ptr    = &tv_string;
208
209    /* create a base text scalar */
210    text_sv = newSV(1);
211    SvPOK_on(text_sv);
212    *(SvEND(text_sv)) = '\0';
213
214    /* read the number of vectorized terms in the field */
215    num_terms = Kino1_InStream_decode_vint(tv_ptr);
216    for (i = 0; i < num_terms; i++) {
217
218        /* decompress the term text */
219        overlap = Kino1_InStream_decode_vint(tv_ptr);
220        SvCUR_set(text_sv, overlap);
221        len = Kino1_InStream_decode_vint(tv_ptr);
222        sv_catpvn(text_sv, *tv_ptr, len);
223        *tv_ptr += len;
224        key = SvPV(text_sv, key_len);
225
226        /* get positions & offsets string */
227        num_positions = Kino1_InStream_decode_vint(tv_ptr);
228        bookmark_ptr = *tv_ptr;
229        while(num_positions--) {
230            /* leave nums compressed to save a little mem */
231            (void)Kino1_InStream_decode_vint(tv_ptr);
232            (void)Kino1_InStream_decode_vint(tv_ptr);
233            (void)Kino1_InStream_decode_vint(tv_ptr);
234        }
235        len = *tv_ptr - bookmark_ptr;
236        nums_sv = newSVpvn(bookmark_ptr, len);
237
238        /* store the $text => $posdata pair in the output hash */
239        hv_store(tv_cache_hv, key, key_len, nums_sv, 0);
240    }
241    SvREFCNT_dec(text_sv);
242
243    return tv_cache_hv;
244}
245
246void
247Kino1_Field_unpack_posdata(SV *posdata_sv, AV *positions_av,
248                          AV *starts_av,  AV *ends_av) {
249    STRLEN  len;
250    char   *posdata, *posdata_end;
251    char  **posdata_ptr;
252    SV     *num_sv;
253    posdata      = SvPV(posdata_sv, len);
254    posdata_ptr  = &posdata;
255    posdata_end  = SvEND(posdata_sv);
256
257    /* translate encoded VInts to Perl scalars */
258    while(*posdata_ptr < posdata_end) {
259        num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) );
260        av_push(positions_av, num_sv);
261        num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) );
262        av_push(starts_av,    num_sv);
263        num_sv = newSViv( Kino1_InStream_decode_vint(posdata_ptr) );
264        av_push(ends_av,      num_sv);
265    }
266
267    if (*posdata_ptr != posdata_end)
268        Kino1_confess("Bad encoding of posdata");
269}
270
271__POD__
272
273=head1 NAME
274
275KinoSearch1::Document::Field - a field within a document
276
277=head1 SYNOPSIS
278
279    # no public interface
280
281=head1 DESCRIPTION
282
283Fields can only be defined or manipulated indirectly, via InvIndexer and Doc.
284
285=head1 COPYRIGHT
286
287Copyright 2005-2010 Marvin Humphrey
288
289=head1 LICENSE, DISCLAIMER, BUGS, etc.
290
291See L<KinoSearch1> version 1.01.
292
293=cut
294
295
296