1package KinoSearch1::Index::SegTermDocs;
2use strict;
3use warnings;
4use KinoSearch1::Util::ToolSet;
5use base qw( KinoSearch1::Index::TermDocs );
6
# Declare this class's constructor parameters and their defaults at compile
# time.  %instance_vars is consulted by new() for argument checking; it is
# presumably populated by init_instance_vars() -- confirm in the base class.
BEGIN {
    __PACKAGE__->init_instance_vars(

        # constructor params
        reader => undef,    # segment reader whose streams get cloned in new()
    );
}
our %instance_vars;
14
# Constructor.
#
# Named params:
#   reader - the segment reader to pull streams, skip interval and deletions
#            from.
#
# Confesses if the supplied arguments fail verify_args().  Returns the new
# object with its C-level child struct initialised and private clones of the
# reader's input streams installed.
sub new {
    my $class = shift;
    my $self  = $class->SUPER::new;

    verify_args( \%instance_vars, @_ ) or confess kerror();
    my %args   = ( %instance_vars, @_ );
    my $reader = $args{reader};

    # allocate the C struct that backs this object
    _init_child($self);

    # dupe some stuff from the parent reader.
    $self->_set_reader($reader);
    $self->_set_skip_interval( $reader->get_skip_interval );

    # Each stream is cloned so this object gets its own handle.  The skip
    # stream is deliberately a second clone of the *freq* stream:
    # seek_tinfo() locates skip data at frq_fileptr + skip_offset, i.e.
    # inside the same file as the freq data.
    $self->_set_freq_stream( $reader->get_freq_stream()->clone_stream );
    $self->_set_skip_stream( $reader->get_freq_stream()->clone_stream );
    $self->_set_prox_stream( $reader->get_prox_stream()->clone_stream );

    $self->_set_deldocs( $reader->get_deldocs );

    return $self;
}
33
# Position this TermDocs on $term.
#
# A defined $term is resolved to its term info via the reader; an undefined
# $term results in seek_tinfo() being called with undef.  Returns whatever
# seek_tinfo() returns.
sub seek {
    my ( $self, $term ) = @_;

    my $tinfo;
    if ( defined $term ) {
        $tinfo = $self->_get_reader()->fetch_term_info($term);
    }

    return $self->seek_tinfo($tinfo);
}
42
# Close the three input streams held by this object, in the order
# freq, prox, skip.  Returns the result of closing the skip stream.
sub close {
    my ($self) = @_;

    my $freq_stream = $self->_get_freq_stream();
    $freq_stream->close;

    my $prox_stream = $self->_get_prox_stream();
    $prox_stream->close;

    my $skip_stream = $self->_get_skip_stream();
    return $skip_stream->close;
}
49
501;
51
52__END__
53__XS__
54
55MODULE = KinoSearch1    PACKAGE = KinoSearch1::Index::SegTermDocs
56
void
_init_child(term_docs)
    TermDocs *term_docs;
PPCODE:
{
    /* Allocate the SegTermDocs-specific child struct for this TermDocs and
     * install the Kino1_SegTermDocs_* method pointers.  Returns nothing. */
    Kino1_SegTermDocs_init_child(term_docs);
}
62
SV*
_set_or_get(term_docs, ...)
    TermDocs *term_docs;
ALIAS:
    _set_count         = 1
    _get_count         = 2
    _set_freq_stream   = 3
    _get_freq_stream   = 4
    _set_prox_stream   = 5
    _get_prox_stream   = 6
    _set_skip_stream   = 7
    _get_skip_stream   = 8
    _set_deldocs       = 9
    _get_deldocs       = 10
    _set_reader        = 11
    _get_reader        = 12
    set_read_positions = 13
    get_read_positions = 14
    _set_skip_interval = 15
    _get_skip_interval = 16
CODE:
{
    /* Combined accessor.  Odd alias numbers are setters taking the new value
     * in ST(1); each setter falls through to the matching even-numbered
     * getter, so every alias returns the (possibly just updated) value.
     *
     * For the object-valued members a private copy of the Perl reference is
     * kept in the *_sv slot, and -- for streams and deldocs -- the C struct
     * pointer is extracted from it into the companion member.
     */
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;

    KINO_START_SET_OR_GET_SWITCH

    case 1:  child->count = SvUV(ST(1));
             /* fall through */
    case 2:  RETVAL = newSVuv(child->count);
             break;

    case 3:  SvREFCNT_dec(child->freq_stream_sv);
             child->freq_stream_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->freq_stream_sv, child->freq_stream,
                InStream*, "KinoSearch1::Store::InStream");
             /* fall through */
    case 4:  RETVAL = newSVsv(child->freq_stream_sv);
             break;

    case 5:  SvREFCNT_dec(child->prox_stream_sv);
             child->prox_stream_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->prox_stream_sv, child->prox_stream,
                InStream*, "KinoSearch1::Store::InStream");
             /* fall through */
    case 6:  RETVAL = newSVsv(child->prox_stream_sv);
             break;

    case 7:  SvREFCNT_dec(child->skip_stream_sv);
             child->skip_stream_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->skip_stream_sv, child->skip_stream,
                InStream*, "KinoSearch1::Store::InStream");
             /* fall through */
    case 8:  RETVAL = newSVsv(child->skip_stream_sv);
             break;

    case 9:  SvREFCNT_dec(child->deldocs_sv);
             child->deldocs_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->deldocs_sv, child->deldocs,
                BitVector*, "KinoSearch1::Index::DelDocs" );
             /* fall through */
    case 10: RETVAL = newSVsv(child->deldocs_sv);
             break;

    case 11: /* Validate BEFORE releasing the old reader.  If the check
              * fails and Kino1_confess throws, child->reader_sv must still
              * own a live reference; otherwise destroy() would decrement an
              * SV whose refcount was already dropped here. */
             if (!sv_derived_from( ST(1), "KinoSearch1::Index::IndexReader") )
                Kino1_confess("not a KinoSearch1::Index::IndexReader");
             SvREFCNT_dec(child->reader_sv);
             child->reader_sv = newSVsv( ST(1) );
             /* fall through */
    case 12: RETVAL = newSVsv(child->reader_sv);
             break;

    case 13: child->read_positions = SvTRUE( ST(1) ) ? 1 : 0;
             /* fall through */
    case 14: RETVAL = newSViv(child->read_positions);
             break;

    case 15: child->skip_interval = SvUV(ST(1));
             /* fall through */
    case 16: RETVAL = newSVuv(child->skip_interval);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL
147
148__H__
149
150#ifndef H_KINO_SEG_TERM_DOCS
151#define H_KINO_SEG_TERM_DOCS 1
152
153#include "EXTERN.h"
154#include "perl.h"
155#include "XSUB.h"
156#include "KinoSearch1UtilBitVector.h"
157#include "KinoSearch1IndexTermDocs.h"
158#include "KinoSearch1IndexTermInfo.h"
159#include "KinoSearch1StoreInStream.h"
160#include "KinoSearch1UtilMemManager.h"
161
162typedef struct segtermdocschild {
163    U32        count;
164    U32        doc_freq;
165    U32        doc;
166    U32        freq;
167    U32        skip_doc;
168    U32        skip_count;
169    U32        num_skips;
170    SV        *positions;
171    U32        read_positions;
172    U32        skip_interval;
173    InStream  *freq_stream;
174    InStream  *prox_stream;
175    InStream  *skip_stream;
176    bool       have_skipped;
177    double     frq_fileptr;
178    double     prx_fileptr;
179    double     skip_fileptr;
180    BitVector *deldocs;
181    SV        *freq_stream_sv;
182    SV        *prox_stream_sv;
183    SV        *skip_stream_sv;
184    SV        *deldocs_sv;
185    SV        *reader_sv;
186} SegTermDocsChild;
187
188void Kino1_SegTermDocs_init_child(TermDocs*);
189void Kino1_SegTermDocs_set_doc_freq(TermDocs*, U32);
190U32  Kino1_SegTermDocs_get_doc_freq(TermDocs*);
191U32  Kino1_SegTermDocs_get_doc(TermDocs*);
192U32  Kino1_SegTermDocs_get_freq(TermDocs*);
193SV*  Kino1_SegTermDocs_get_positions(TermDocs*);
194U32  Kino1_SegTermDocs_bulk_read(TermDocs*, SV*, SV*, U32);
195void Kino1_SegTermDocs_seek_tinfo(TermDocs*, TermInfo*);
196bool Kino1_SegTermDocs_next(TermDocs*);
197bool Kino1_SegTermDocs_skip_to(TermDocs*, U32 target);
198bool Kino1_SegTermDocs_skip_to_with_positions(TermDocs*);
199void Kino1_SegTermDocs_destroy(TermDocs*);
200
201#endif /* include guard */
202
203__C__
204
205#include "KinoSearch1IndexSegTermDocs.h"
206
207static void
208load_positions(TermDocs *term_docs);
209
210void
211Kino1_SegTermDocs_init_child(TermDocs *term_docs) {
212    SegTermDocsChild *child;
213
214    Kino1_New(1, child, 1, SegTermDocsChild);
215    term_docs->child = child;
216
217    child->doc_freq = KINO_TERM_DOCS_SENTINEL;
218    child->doc      = KINO_TERM_DOCS_SENTINEL;
219    child->freq     = KINO_TERM_DOCS_SENTINEL;
220
221    /* child->positions starts life as an empty string */
222    child->positions = newSV(1);
223    SvCUR_set(child->positions, 0);
224    SvPOK_on(child->positions);
225
226    term_docs->set_doc_freq  = Kino1_SegTermDocs_set_doc_freq;
227    term_docs->get_doc_freq  = Kino1_SegTermDocs_get_doc_freq;
228    term_docs->get_doc       = Kino1_SegTermDocs_get_doc;
229    term_docs->get_freq      = Kino1_SegTermDocs_get_freq;
230    term_docs->get_positions = Kino1_SegTermDocs_get_positions;
231    term_docs->bulk_read     = Kino1_SegTermDocs_bulk_read;
232    term_docs->seek_tinfo    = Kino1_SegTermDocs_seek_tinfo;
233    term_docs->next          = Kino1_SegTermDocs_next;
234    term_docs->skip_to       = Kino1_SegTermDocs_skip_to;
235    term_docs->destroy       = Kino1_SegTermDocs_destroy;
236
237    child->freq_stream_sv   = &PL_sv_undef;
238    child->prox_stream_sv   = &PL_sv_undef;
239    child->skip_stream_sv   = &PL_sv_undef;
240    child->deldocs_sv       = &PL_sv_undef;
241    child->reader_sv        = &PL_sv_undef;
242    child->count            = 0;
243
244    child->read_positions = 0; /* off by default */
245}
246
247void
248Kino1_SegTermDocs_set_doc_freq(TermDocs *term_docs, U32 doc_freq) {
249    SegTermDocsChild *child;
250    child = (SegTermDocsChild*)term_docs->child;
251    child->doc_freq = doc_freq;
252}
253
254U32
255Kino1_SegTermDocs_get_doc_freq(TermDocs *term_docs) {
256    SegTermDocsChild *child;
257    child = (SegTermDocsChild*)term_docs->child;
258    return child->doc_freq;
259}
260
261U32
262Kino1_SegTermDocs_get_doc(TermDocs *term_docs) {
263    SegTermDocsChild *child;
264    child = (SegTermDocsChild*)term_docs->child;
265    return child->doc;
266}
267
268
269U32
270Kino1_SegTermDocs_get_freq(TermDocs *term_docs) {
271    SegTermDocsChild *child;
272    child = (SegTermDocsChild*)term_docs->child;
273    return child->freq;
274}
275
276SV*
277Kino1_SegTermDocs_get_positions(TermDocs *term_docs) {
278    SegTermDocsChild *child;
279    child = (SegTermDocsChild*)term_docs->child;
280    return child->positions;
281}
282
283U32
284Kino1_SegTermDocs_bulk_read(TermDocs *term_docs, SV* doc_nums_sv,
285                           SV* freqs_sv, U32 num_wanted) {
286    SegTermDocsChild *child;
287    InStream         *freq_stream;
288    U32               doc_code;
289    U32              *doc_nums;
290    U32              *freqs;
291    STRLEN            len;
292    U32               num_got = 0;
293
294    /* local copies */
295    child       = (SegTermDocsChild*)term_docs->child;
296    freq_stream = child->freq_stream;
297
298    /* allocate space in supplied SVs and make them POK, if necessary */
299    len = num_wanted * sizeof(U32);
300    SvUPGRADE(doc_nums_sv, SVt_PV);
301    SvUPGRADE(freqs_sv,    SVt_PV);
302    SvPOK_on(doc_nums_sv);
303    SvPOK_on(freqs_sv);
304    doc_nums = (U32*)SvGROW(doc_nums_sv, len + 1);
305    freqs    = (U32*)SvGROW(freqs_sv,    len + 1);
306
307    while (child->count < child->doc_freq && num_got < num_wanted) {
308        /* manually inlined call to term_docs->next */
309        child->count++;
310        doc_code = freq_stream->read_vint(freq_stream);;
311        child->doc  += doc_code >> 1;
312        if (doc_code & 1)
313            child->freq = 1;
314        else
315            child->freq = freq_stream->read_vint(freq_stream);
316
317        /* if the doc isn't deleted... */
318        if ( !Kino1_BitVec_get(child->deldocs, child->doc) ) {
319            /* ... append to results */
320            *doc_nums++ = child->doc;
321            *freqs++    = child->freq;
322            num_got++;
323        }
324    }
325
326    /* set the string end to the end of the U32 array */
327    SvCUR_set(doc_nums_sv, (num_got * sizeof(U32)));
328    SvCUR_set(freqs_sv,    (num_got * sizeof(U32)));
329
330    return num_got;
331}
332
333bool
334Kino1_SegTermDocs_next(TermDocs *term_docs) {
335    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;
336    InStream         *freq_stream = child->freq_stream;
337    U32               doc_code;
338
339    while (1) {
340        /* bail if we're out of docs */
341        if (child->count == child->doc_freq) {
342            return 0;
343        }
344
345        /* decode delta doc */
346        doc_code = freq_stream->read_vint(freq_stream);
347        child->doc  += doc_code >> 1;
348
349        /* if the stored num was odd, the freq is 1 */
350        if (doc_code & 1) {
351            child->freq = 1;
352        }
353        /* otherwise, freq was stored as a VInt. */
354        else {
355            child->freq = freq_stream->read_vint(freq_stream);
356        }
357
358        child->count++;
359
360        /* read positions if desired */
361        if (child->read_positions)
362            load_positions(term_docs);
363
364        /* if the doc isn't deleted... success! */
365        if (!Kino1_BitVec_get(child->deldocs, child->doc))
366            break;
367    }
368    return 1;
369}
370
371static void
372load_positions(TermDocs *term_docs) {
373    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;
374    InStream *prox_stream = child->prox_stream;
375    STRLEN len = child->freq * sizeof(U32);
376    U32 *positions, *positions_end;
377    U32 position = 0;
378
379    SvGROW( child->positions, len );
380    SvCUR_set(child->positions, len);
381    positions = (U32*)SvPVX(child->positions);
382    positions_end = (U32*)SvEND(child->positions);
383    while (positions < positions_end) {
384        position += prox_stream->read_vint(prox_stream);
385        *positions++ = position;
386    }
387}
388
389void
390Kino1_SegTermDocs_seek_tinfo(TermDocs *term_docs, TermInfo *tinfo) {
391    SegTermDocsChild *child;
392    child = (SegTermDocsChild*)term_docs->child;
393
394    child->count = 0;
395
396    if (tinfo == NULL) {
397        child->doc_freq = 0;
398    }
399    else {
400        child->doc          = 0;
401        child->freq         = 0;
402        child->skip_doc     = 0;
403        child->skip_count   = 0;
404        child->have_skipped = FALSE;
405        child->num_skips    = tinfo->doc_freq / child->skip_interval;
406        child->doc_freq     = tinfo->doc_freq;
407        child->frq_fileptr  = tinfo->frq_fileptr;
408        child->prx_fileptr  = tinfo->prx_fileptr;
409        child->skip_fileptr = tinfo->frq_fileptr + tinfo->skip_offset;
410        child->freq_stream->seek( child->freq_stream, tinfo->frq_fileptr );
411        child->prox_stream->seek( child->prox_stream, tinfo->prx_fileptr );
412    }
413}
414
415bool
416Kino1_SegTermDocs_skip_to(TermDocs *term_docs, U32 target) {
417    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;
418
419    if (child->doc_freq >= child->skip_interval) {
420        InStream *freq_stream   = child->freq_stream;
421        InStream *prox_stream   = child->prox_stream;
422        InStream *skip_stream   = child->skip_stream;
423        U32 last_skip_doc       = child->skip_doc;
424        double last_frq_fileptr = freq_stream->tell(freq_stream);
425        double last_prx_fileptr = -1;
426        I32 num_skipped         = -1 - (child->count % child->skip_interval);
427
428        if (!child->have_skipped) {
429            child->skip_stream->seek(child->skip_stream, child->skip_fileptr);
430            child->have_skipped = TRUE;
431        }
432
433        while (target > child->skip_doc) {
434            last_skip_doc    = child->skip_doc;
435            last_frq_fileptr = child->frq_fileptr;
436            last_prx_fileptr = child->prx_fileptr;
437
438            if (child->skip_doc != 0 && child->skip_doc >= child->doc) {
439                num_skipped += child->skip_interval;
440            }
441
442            if (child->skip_count >= child->num_skips) {
443                break;
444            }
445
446            child->skip_doc += skip_stream->read_vint(skip_stream);
447            child->frq_fileptr += skip_stream->read_vint(skip_stream);
448            child->prx_fileptr += skip_stream->read_vint(skip_stream);
449
450            child->skip_count++;
451        }
452
453        /* if there's something to skip, skip it */
454        if (last_frq_fileptr > freq_stream->tell(freq_stream)) {
455            freq_stream->seek(freq_stream, last_frq_fileptr);
456            if (child->read_positions) {
457                prox_stream->seek(prox_stream, last_prx_fileptr);
458            }
459            child->doc = last_skip_doc;
460            child->count += num_skipped;
461        }
462    }
463
464    /* done skipping, so scan */
465    do {
466        if (!term_docs->next(term_docs)) {
467            return FALSE;
468        }
469    } while (target > child->doc);
470    return TRUE;
471}
472
473void
474Kino1_SegTermDocs_destroy(TermDocs *term_docs){
475    SegTermDocsChild *child;
476    child = (SegTermDocsChild*)term_docs->child;
477
478    SvREFCNT_dec(child->positions);
479    SvREFCNT_dec(child->freq_stream_sv);
480    SvREFCNT_dec(child->prox_stream_sv);
481    SvREFCNT_dec(child->skip_stream_sv);
482    SvREFCNT_dec(child->deldocs_sv);
483    SvREFCNT_dec(child->reader_sv);
484
485    Kino1_Safefree(child);
486
487    Kino1_TermDocs_destroy(term_docs);
488}
489
490__POD__
491
492==begin devdocs
493
494==head1 NAME
495
496KinoSearch1::Index::SegTermDocs - single-segment TermDocs
497
498==head1 DESCRIPTION
499
Single-segment implementation of KinoSearch1::Index::TermDocs.
501
502==head1 COPYRIGHT
503
504Copyright 2005-2010 Marvin Humphrey
505
506==head1 LICENSE, DISCLAIMER, BUGS, etc.
507
508See L<KinoSearch1> version 1.01.
509
510==end devdocs
511==cut
512