package KinoSearch1::Index::SegTermDocs;
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;
use base qw( KinoSearch1::Index::TermDocs );

BEGIN {
    __PACKAGE__->init_instance_vars(
        # constructor params
        reader => undef,
    );
}
our %instance_vars;

# Constructor.
#
# Takes a single labeled parameter, C<reader>, and copies from it everything
# this object needs in order to iterate: the skip interval, the deleted-docs
# object, and private clones of the reader's freq and prox streams.
sub new {
    my $self = shift->SUPER::new;
    confess kerror() unless verify_args( \%instance_vars, @_ );
    my %args   = ( %instance_vars, @_ );
    my $reader = $args{reader};

    # allocate and populate the C-level child struct
    _init_child($self);

    # dupe some stuff from the parent reader.
    $self->_set_reader($reader);
    $self->_set_skip_interval( $reader->get_skip_interval );
    $self->_set_freq_stream( $reader->get_freq_stream()->clone_stream );

    # The skip stream is intentionally a second clone of the *freq* stream:
    # seek_tinfo computes the skip file pointer as frq_fileptr + skip_offset,
    # i.e. skip data is addressed within the freq file.
    $self->_set_skip_stream( $reader->get_freq_stream()->clone_stream );
    $self->_set_prox_stream( $reader->get_prox_stream()->clone_stream );
    $self->_set_deldocs( $reader->get_deldocs );

    return $self;
}

# Position the iterator on the postings for $term.  An undefined $term (or
# rather, an undefined term info) is passed through to seek_tinfo, which
# treats it as "no documents".
sub seek {
    my ( $self, $term ) = @_;
    my $tinfo
        = defined $term
        ? $self->_get_reader()->fetch_term_info($term)
        : undef;
    $self->seek_tinfo($tinfo);
}

# Close the three private stream clones created in new().
sub close {
    my $self = shift;
    $self->_get_freq_stream()->close;
    $self->_get_prox_stream()->close;
    $self->_get_skip_stream()->close;
}

1;

__END__
__XS__

MODULE = KinoSearch1    PACKAGE = KinoSearch1::Index::SegTermDocs

void
_init_child(term_docs)
    TermDocs *term_docs;
PPCODE:
    Kino1_SegTermDocs_init_child(term_docs);

SV*
_set_or_get(term_docs, ...)
    TermDocs *term_docs;
ALIAS:
    _set_count         = 1
    _get_count         = 2
    _set_freq_stream   = 3
    _get_freq_stream   = 4
    _set_prox_stream   = 5
    _get_prox_stream   = 6
    _set_skip_stream   = 7
    _get_skip_stream   = 8
    _set_deldocs       = 9
    _get_deldocs       = 10
    _set_reader        = 11
    _get_reader        = 12
    set_read_positions = 13
    get_read_positions = 14
    _set_skip_interval = 15
    _get_skip_interval = 16
CODE:
{
    /* Combined accessor: odd alias numbers are setters, which store ST(1)
     * and then fall through to the matching even-numbered getter, so every
     * alias returns the (possibly just updated) value.
     */
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;

    KINO_START_SET_OR_GET_SWITCH

    case 1:  child->count = SvUV(ST(1));
             /* fall through */
    case 2:  RETVAL = newSVuv(child->count);
             break;

    case 3:  SvREFCNT_dec(child->freq_stream_sv);
             child->freq_stream_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->freq_stream_sv, child->freq_stream,
                InStream*, "KinoSearch1::Store::InStream");
             /* fall through */
    case 4:  RETVAL = newSVsv(child->freq_stream_sv);
             break;

    case 5:  SvREFCNT_dec(child->prox_stream_sv);
             child->prox_stream_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->prox_stream_sv, child->prox_stream,
                InStream*, "KinoSearch1::Store::InStream");
             /* fall through */
    case 6:  RETVAL = newSVsv(child->prox_stream_sv);
             break;

    case 7:  SvREFCNT_dec(child->skip_stream_sv);
             child->skip_stream_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->skip_stream_sv, child->skip_stream,
                InStream*, "KinoSearch1::Store::InStream");
             /* fall through */
    case 8:  RETVAL = newSVsv(child->skip_stream_sv);
             break;

    case 9:  SvREFCNT_dec(child->deldocs_sv);
             child->deldocs_sv = newSVsv( ST(1) );
             Kino1_extract_struct( child->deldocs_sv, child->deldocs,
                BitVector*, "KinoSearch1::Index::DelDocs" );
             /* fall through */
    case 10: RETVAL = newSVsv(child->deldocs_sv);
             break;

    case 11: /* Validate BEFORE releasing the old reader.  If the check
              * fails we confess, and child->reader_sv must still own a
              * live reference or destroy() would release it twice.
              */
             if (!sv_derived_from( ST(1), "KinoSearch1::Index::IndexReader") )
                 Kino1_confess("not a KinoSearch1::Index::IndexReader");
             SvREFCNT_dec(child->reader_sv);
             child->reader_sv = newSVsv( ST(1) );
             /* fall through */
    case 12: RETVAL = newSVsv(child->reader_sv);
             break;

    case 13: child->read_positions = SvTRUE( ST(1) ) ? 1 : 0;
             /* fall through */
    case 14: RETVAL = newSViv(child->read_positions);
             break;

    case 15: child->skip_interval = SvUV(ST(1));
             /* fall through */
    case 16: RETVAL = newSVuv(child->skip_interval);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL

__H__

#ifndef H_KINO_SEG_TERM_DOCS
#define H_KINO_SEG_TERM_DOCS 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearch1UtilBitVector.h"
#include "KinoSearch1IndexTermDocs.h"
#include "KinoSearch1IndexTermInfo.h"
#include "KinoSearch1StoreInStream.h"
#include "KinoSearch1UtilMemManager.h"

/* Per-object state hung off TermDocs->child by Kino1_SegTermDocs_init_child.
 */
typedef struct segtermdocschild {
    U32        count;          /* postings consumed since the last seek */
    U32        doc_freq;       /* total postings for the current term */
    U32        doc;            /* current doc num (sum of decoded deltas) */
    U32        freq;           /* term freq within the current doc */
    U32        skip_doc;       /* doc num of the last skip entry read */
    U32        skip_count;     /* skip entries read so far */
    U32        num_skips;      /* doc_freq / skip_interval */
    SV        *positions;      /* packed U32 array of positions */
    U32        read_positions; /* boolean: load positions in next()? */
    U32        skip_interval;
    InStream  *freq_stream;
    InStream  *prox_stream;
    InStream  *skip_stream;
    bool       have_skipped;   /* has skip_stream been positioned yet? */
    double     frq_fileptr;
    double     prx_fileptr;
    double     skip_fileptr;
    BitVector *deldocs;
    /* Perl-side owners of the C structs above, plus the parent reader. */
    SV        *freq_stream_sv;
    SV        *prox_stream_sv;
    SV        *skip_stream_sv;
    SV        *deldocs_sv;
    SV        *reader_sv;
} SegTermDocsChild;

void Kino1_SegTermDocs_init_child(TermDocs*);
void Kino1_SegTermDocs_set_doc_freq(TermDocs*, U32);
U32  Kino1_SegTermDocs_get_doc_freq(TermDocs*);
U32  Kino1_SegTermDocs_get_doc(TermDocs*);
U32  Kino1_SegTermDocs_get_freq(TermDocs*);
SV*  Kino1_SegTermDocs_get_positions(TermDocs*);
U32  Kino1_SegTermDocs_bulk_read(TermDocs*, SV*, SV*, U32);
void Kino1_SegTermDocs_seek_tinfo(TermDocs*, TermInfo*);
bool Kino1_SegTermDocs_next(TermDocs*);
bool Kino1_SegTermDocs_skip_to(TermDocs*, U32 target);
/* NOTE(review): declared here, but no definition is visible in this file. */
bool Kino1_SegTermDocs_skip_to_with_positions(TermDocs*);
void Kino1_SegTermDocs_destroy(TermDocs*);

#endif /* include guard */

__C__

#include "KinoSearch1IndexSegTermDocs.h"

static void
load_positions(TermDocs *term_docs);

/* Allocate the SegTermDocsChild, attach it to term_docs, set every member to
 * a known starting value and install this class's method pointers.
 */
void
Kino1_SegTermDocs_init_child(TermDocs *term_docs) {
    SegTermDocsChild *child;

    Kino1_New(1, child, 1, SegTermDocsChild);
    term_docs->child = child;

    child->doc_freq = KINO_TERM_DOCS_SENTINEL;
    child->doc      = KINO_TERM_DOCS_SENTINEL;
    child->freq     = KINO_TERM_DOCS_SENTINEL;

    /* Give every remaining member a defined value rather than relying on
     * the allocator; the real values arrive via the setters and seek_tinfo.
     */
    child->skip_doc      = 0;
    child->skip_count    = 0;
    child->num_skips     = 0;
    child->skip_interval = 0;
    child->have_skipped  = FALSE;
    child->frq_fileptr   = 0.0;
    child->prx_fileptr   = 0.0;
    child->skip_fileptr  = 0.0;
    child->freq_stream   = NULL;
    child->prox_stream   = NULL;
    child->skip_stream   = NULL;
    child->deldocs       = NULL;

    /* child->positions starts life as an empty string */
    child->positions = newSV(1);
    SvCUR_set(child->positions, 0);
    SvPOK_on(child->positions);

    term_docs->set_doc_freq  = Kino1_SegTermDocs_set_doc_freq;
    term_docs->get_doc_freq  = Kino1_SegTermDocs_get_doc_freq;
    term_docs->get_doc       = Kino1_SegTermDocs_get_doc;
    term_docs->get_freq      = Kino1_SegTermDocs_get_freq;
    term_docs->get_positions = Kino1_SegTermDocs_get_positions;
    term_docs->bulk_read     = Kino1_SegTermDocs_bulk_read;
    term_docs->seek_tinfo    = Kino1_SegTermDocs_seek_tinfo;
    term_docs->next          = Kino1_SegTermDocs_next;
    term_docs->skip_to       = Kino1_SegTermDocs_skip_to;
    term_docs->destroy       = Kino1_SegTermDocs_destroy;

    child->freq_stream_sv = &PL_sv_undef;
    child->prox_stream_sv = &PL_sv_undef;
    child->skip_stream_sv = &PL_sv_undef;
    child->deldocs_sv     = &PL_sv_undef;
    child->reader_sv      = &PL_sv_undef;
    child->count          = 0;

    child->read_positions = 0; /* off by default */
}

void
Kino1_SegTermDocs_set_doc_freq(TermDocs *term_docs, U32 doc_freq) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    child->doc_freq = doc_freq;
}

U32
Kino1_SegTermDocs_get_doc_freq(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->doc_freq;
}

U32
Kino1_SegTermDocs_get_doc(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->doc;
}


U32
Kino1_SegTermDocs_get_freq(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->freq;
}

/* Returns the child's own positions SV (not a copy). */
SV*
Kino1_SegTermDocs_get_positions(TermDocs *term_docs) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;
    return child->positions;
}

/* Read up to num_wanted non-deleted postings, writing doc nums and freqs as
 * packed U32 arrays into the string buffers of doc_nums_sv and freqs_sv.
 * Returns the number of postings actually stored.  Positions are never
 * loaded here.
 */
U32
Kino1_SegTermDocs_bulk_read(TermDocs *term_docs, SV* doc_nums_sv,
                            SV* freqs_sv, U32 num_wanted) {
    SegTermDocsChild *child;
    InStream         *freq_stream;
    U32               doc_code;
    U32              *doc_nums;
    U32              *freqs;
    STRLEN            len;
    U32               num_got = 0;

    /* local copies */
    child       = (SegTermDocsChild*)term_docs->child;
    freq_stream = child->freq_stream;

    /* allocate space in supplied SVs and make them POK, if necessary */
    len = num_wanted * sizeof(U32);
    SvUPGRADE(doc_nums_sv, SVt_PV);
    SvUPGRADE(freqs_sv,    SVt_PV);
    SvPOK_on(doc_nums_sv);
    SvPOK_on(freqs_sv);
    doc_nums = (U32*)SvGROW(doc_nums_sv, len + 1);
    freqs    = (U32*)SvGROW(freqs_sv,    len + 1);

    while (child->count < child->doc_freq && num_got < num_wanted) {
        /* manually inlined call to term_docs->next */
        child->count++;
        doc_code = freq_stream->read_vint(freq_stream);
        child->doc += doc_code >> 1;
        if (doc_code & 1)
            child->freq = 1;
        else
            child->freq = freq_stream->read_vint(freq_stream);

        /* if the doc isn't deleted... */
        if ( !Kino1_BitVec_get(child->deldocs, child->doc) ) {
            /* ... append to results */
            *doc_nums++ = child->doc;
            *freqs++    = child->freq;
            num_got++;
        }
    }

    /* set the string end to the end of the U32 array */
    SvCUR_set(doc_nums_sv, (num_got * sizeof(U32)));
    SvCUR_set(freqs_sv,    (num_got * sizeof(U32)));

    return num_got;
}

/* Advance to the next non-deleted posting.  Returns 1 on success, 0 once
 * all doc_freq postings have been consumed.
 */
bool
Kino1_SegTermDocs_next(TermDocs *term_docs) {
    SegTermDocsChild *child       = (SegTermDocsChild*)term_docs->child;
    InStream         *freq_stream = child->freq_stream;
    U32               doc_code;

    while (1) {
        /* bail if we're out of docs */
        if (child->count == child->doc_freq) {
            return 0;
        }

        /* decode delta doc */
        doc_code = freq_stream->read_vint(freq_stream);
        child->doc += doc_code >> 1;

        /* if the stored num was odd, the freq is 1 */
        if (doc_code & 1) {
            child->freq = 1;
        }
        /* otherwise, freq was stored as a VInt. */
        else {
            child->freq = freq_stream->read_vint(freq_stream);
        }

        child->count++;

        /* read positions if desired (done even for deleted docs, which
         * keeps the prox stream in step with the freq stream) */
        if (child->read_positions)
            load_positions(term_docs);

        /* if the doc isn't deleted... success! */
        if (!Kino1_BitVec_get(child->deldocs, child->doc))
            break;
    }
    return 1;
}

/* Decode child->freq delta-encoded positions from the prox stream into
 * child->positions as a packed U32 array.
 */
static void
load_positions(TermDocs *term_docs) {
    SegTermDocsChild *child       = (SegTermDocsChild*)term_docs->child;
    InStream         *prox_stream = child->prox_stream;
    STRLEN            len         = child->freq * sizeof(U32);
    U32              *positions, *positions_end;
    U32               position    = 0;

    /* + 1 reserves room for a trailing NUL, as bulk_read does */
    SvGROW( child->positions, len + 1 );
    SvCUR_set(child->positions, len);
    positions     = (U32*)SvPVX(child->positions);
    positions_end = (U32*)SvEND(child->positions);
    while (positions < positions_end) {
        position += prox_stream->read_vint(prox_stream);
        *positions++ = position;
    }
}

/* Reset iteration state for the term described by tinfo and seek the freq
 * and prox streams to its postings.  A NULL tinfo leaves the streams alone
 * and sets doc_freq to 0, so next() reports no documents.
 */
void
Kino1_SegTermDocs_seek_tinfo(TermDocs *term_docs, TermInfo *tinfo) {
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;

    child->count = 0;

    if (tinfo == NULL) {
        child->doc_freq = 0;
    }
    else {
        child->doc          = 0;
        child->freq         = 0;
        child->skip_doc     = 0;
        child->skip_count   = 0;
        child->have_skipped = FALSE;
        /* guard against a skip interval that was never set */
        child->num_skips    = child->skip_interval > 0
                            ? tinfo->doc_freq / child->skip_interval
                            : 0;
        child->doc_freq     = tinfo->doc_freq;
        child->frq_fileptr  = tinfo->frq_fileptr;
        child->prx_fileptr  = tinfo->prx_fileptr;
        child->skip_fileptr = tinfo->frq_fileptr + tinfo->skip_offset;
        child->freq_stream->seek( child->freq_stream, tinfo->frq_fileptr );
        child->prox_stream->seek( child->prox_stream, tinfo->prx_fileptr );
    }
}

/* Advance to the first non-deleted posting whose doc num is >= target.
 * When the term has at least skip_interval postings, skip entries are read
 * first so the freq (and, if wanted, prox) stream can jump ahead; the
 * remainder is covered by calling next() repeatedly.  Returns FALSE if the
 * postings run out first.
 */
bool
Kino1_SegTermDocs_skip_to(TermDocs *term_docs, U32 target) {
    SegTermDocsChild *child = (SegTermDocsChild*)term_docs->child;

    /* the skip_interval > 0 test protects the modulus below */
    if (child->skip_interval > 0
        && child->doc_freq >= child->skip_interval
    ) {
        InStream *freq_stream      = child->freq_stream;
        InStream *prox_stream      = child->prox_stream;
        InStream *skip_stream      = child->skip_stream;
        U32       last_skip_doc    = child->skip_doc;
        double    last_frq_fileptr = freq_stream->tell(freq_stream);
        double    last_prx_fileptr = -1;
        I32       num_skipped      = -1 - (child->count % child->skip_interval);

        /* position the skip stream the first time it is needed */
        if (!child->have_skipped) {
            child->skip_stream->seek(child->skip_stream, child->skip_fileptr);
            child->have_skipped = TRUE;
        }

        while (target > child->skip_doc) {
            /* remember the entry we may end up seeking to */
            last_skip_doc    = child->skip_doc;
            last_frq_fileptr = child->frq_fileptr;
            last_prx_fileptr = child->prx_fileptr;

            if (child->skip_doc != 0 && child->skip_doc >= child->doc) {
                num_skipped += child->skip_interval;
            }

            if (child->skip_count >= child->num_skips) {
                break;
            }

            /* skip entries are stored as deltas */
            child->skip_doc    += skip_stream->read_vint(skip_stream);
            child->frq_fileptr += skip_stream->read_vint(skip_stream);
            child->prx_fileptr += skip_stream->read_vint(skip_stream);

            child->skip_count++;
        }

        /* if there's something to skip, skip it */
        if (last_frq_fileptr > freq_stream->tell(freq_stream)) {
            freq_stream->seek(freq_stream, last_frq_fileptr);
            if (child->read_positions) {
                prox_stream->seek(prox_stream, last_prx_fileptr);
            }
            child->doc = last_skip_doc;
            child->count += num_skipped;
        }
    }

    /* done skipping, so scan */
    do {
        if (!term_docs->next(term_docs)) {
            return FALSE;
        }
    } while (target > child->doc);
    return TRUE;
}

/* Release every SV the child holds, free the child struct, then hand off to
 * the parent class destructor.
 */
void
Kino1_SegTermDocs_destroy(TermDocs *term_docs){
    SegTermDocsChild *child;
    child = (SegTermDocsChild*)term_docs->child;

    SvREFCNT_dec(child->positions);
    SvREFCNT_dec(child->freq_stream_sv);
    SvREFCNT_dec(child->prox_stream_sv);
    SvREFCNT_dec(child->skip_stream_sv);
    SvREFCNT_dec(child->deldocs_sv);
    SvREFCNT_dec(child->reader_sv);

    Kino1_Safefree(child);

    Kino1_TermDocs_destroy(term_docs);
}

__POD__

=begin devdocs

=head1 NAME

KinoSearch1::Index::SegTermDocs - single-segment TermDocs

=head1 DESCRIPTION

Single-segment implementation of KinoSearch1::Index::TermDocs.

=head1 COPYRIGHT

Copyright 2005-2010 Marvin Humphrey

=head1 LICENSE, DISCLAIMER, BUGS, etc.

See L<KinoSearch1> version 1.01.

=end devdocs
=cut
