package KinoSearch1::Store::InStream;
use base qw( KinoSearch1::Util::CClass );
use strict;
use warnings;
use KinoSearch1::Util::ToolSet;

=for comment
Close the underlying filehandle and hand back whatever CORE::close returns.

=cut

sub close {
    my $self = shift;
    return CORE::close( $self->get_fh );
}

=for comment
Open a second filehandle onto this stream's handle (mode '<&='), wrap it in a
new InStream with the same offset and length, and position the new stream at
the spot where this one currently is.

=cut

sub clone_stream {
    my $self = shift;

    my $twin_fh;
    open( $twin_fh, '<&=', $self->get_fh )
        or confess("Couldn't dupe filehandle: $!");

    my $clone
        = __PACKAGE__->new( $twin_fh, $self->get_offset, $self->length );
    $clone->seek( $self->tell );

    return $clone;
}

1;

__END__

__XS__

MODULE = KinoSearch1    PACKAGE = KinoSearch1::Store::InStream

=begin comment

    my $instream = KinoSearch1::Store::InStream->new(
        $filehandle, $offset, $length
    );

Constructor.  Takes 1-3 arguments, and unlike most classes in the KinoSearch1
suite, the arguments to the constructor are not labeled parameters.

The second argument, an offset, defaults to 0 if not supplied (or if undef is
passed).  Non-zero offsets get factored in when calling seek and tell.

The last argument, a length, is the length of the "file" in bytes.  Supplying
an explicit value is only essential for InStreams which are assigned to read a
portion of a compound file -- otherwise, the length gets auto-calculated
correctly.

=end comment
=cut

InStream*
new(class, fh_sv, ...)
    char *class;
    SV   *fh_sv;
PREINIT:
    double offset = 0;
    double len    = -1;   /* negative means "work it out from the file" */
CODE:
    /* optional offset argument; an undef leaves the default in place */
    if (items >= 3) {
        SV *offset_sv = ST(2);
        if (SvOK(offset_sv))
            offset = SvNV(offset_sv);
    }
    /* optional length argument; an undef leaves the default in place */
    if (items >= 4) {
        SV *len_sv = ST(3);
        if (SvOK(len_sv))
            len = SvNV(len_sv);
    }
    RETVAL = Kino1_InStream_new(class, fh_sv, offset, len);
OUTPUT: RETVAL


=for comment
Seek to target plus the object's start offset.
=cut

void
seek(instream, target)
    InStream *instream;
    double    target;
PPCODE:
    instream->seek(instream, target);

=for comment
Return the filehandle's position minus the offset.

=cut

double
tell(instream)
    InStream *instream;
CODE:
    RETVAL = instream->tell(instream);
OUTPUT: RETVAL

=for comment
Return the length of the "file" in bytes, factoring in the offset.

=cut

double
length(instream)
    InStream *instream;
CODE:
    RETVAL = instream->len;
OUTPUT: RETVAL

=for comment
Combined accessor for len, offset and fh, dispatched on the XS alias number.
Each setter case falls through to the matching getter case, so a setter
returns the value it just stored.  set_fh is not supported and always
confesses.

=cut

SV*
_set_or_get(instream, ...)
    InStream *instream;
ALIAS:
    set_len    = 1
    get_len    = 2
    set_offset = 3
    get_offset = 4
    set_fh     = 5
    get_fh     = 6
CODE:
{
    KINO_START_SET_OR_GET_SWITCH

    case 1:  instream->len = SvNV( ST(1) );
             /* fall through */
    case 2:  RETVAL = newSVnv(instream->len);
             break;

    case 3:  instream->offset = SvNV( ST(1) );
             /* fall through */
    case 4:  RETVAL = newSVnv(instream->offset);
             break;

    case 5:  Kino1_confess("Can't set_fh");
             /* fall through */
    case 6:  RETVAL = newSVsv(instream->fh_sv);
             break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT: RETVAL

=begin comment

    @items = $instream->lu_read( TEMPLATE );

Read the items specified by TEMPLATE from the InStream.  The template is a
sequence of one-character symbols, each optionally followed by a decimal
repeat count; spaces between symbols are skipped.  For 'a' the count is the
number of bytes to read into a single scalar; for every other symbol it is the
number of items to read.

    a   arbitrary binary data (count = byte length)
    b   signed byte
    B   unsigned byte
    i   signed 32-bit integer
    I   unsigned 32-bit integer
    Q   unsigned "64-bit integer", returned as a double
    T   string (VInt length followed by that many bytes)
    V   VInt
    W   VLong, returned as a double

An unknown symbol or a repeat count below 1 is a fatal error.

=end comment
=cut

void
lu_read (instream, template_sv)
    InStream *instream;
    SV       *template_sv
PREINIT:
    STRLEN  tpt_len;      /* bytelength of template */
    char   *template;     /* ptr to a spot in the template */
    char   *tpt_end;      /* ptr to the end of the template */
    int     repeat_count; /* number of times to repeat sym */
    char    sym;          /* the current symbol in the template */
    IV      aIV;
    SV     *aSV;
    char    aChar;
    char*   string;
    STRLEN  len;
PPCODE:
{
    /* Prepare template string pointers.  The end pointer is derived from the
     * pointer and length that SvPV hands back, so it is right even when the
     * string SvPV returns is not the SV's own PV buffer. */
    template = SvPV(template_sv, tpt_len);
    tpt_end  = template + tpt_len;

    repeat_count = 0;
    while (1) {
        if (repeat_count == 0) {
            /* fast-forward past space characters (bounds check first) */
            while (template < tpt_end && *template == ' ') {
                template++;
            }

            /* break out of the loop if we've exhausted the template */
            if (template == tpt_end) {
                break;
            }

            /* take the current symbol */
            sym = *template++;

            if (   template < tpt_end
                && *template >= '0'
                && *template <= '9'
            ) {
                /* calculate numerical repeat count */
                repeat_count = 0;
                do {
                    repeat_count = (repeat_count * 10)
                        + (*template - KINO_NUM_CHAR_OFFSET);
                    template++;
                } while (   template < tpt_end
                         && *template >= '0'
                         && *template <= '9');
            }
            else {
                /* sym is the last char in the template, or has no numeric
                 * repeat count, so process it only once */
                repeat_count = 1;
            }
        }

        /* thwart potential infinite loop */
        if (repeat_count < 1)
            Kino1_confess( "invalid repeat_count: %d", repeat_count);

        switch(sym) {

        case 'a': /* arbitrary binary data */
            /* here the count is a byte length, not a number of items */
            len = repeat_count;
            repeat_count = 1;
            aSV = newSV(len + 1);
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            instream->read_bytes(instream, string, len);
            *SvEND(aSV) = '\0'; /* keep the PV NUL-terminated */
            break;

        case 'b': /* signed byte */
        case 'B': /* unsigned byte */
            aChar = instream->read_byte(instream);
            if (sym == 'b')
                aIV = (signed char)aChar;
            else
                aIV = (unsigned char)aChar;
            aSV = newSViv(aIV);
            break;

        case 'i': /* signed 32-bit integer */
            aSV = newSViv( (I32)instream->read_int(instream) );
            break;

        case 'I': /* unsigned 32-bit integer */
            aSV = newSVuv( instream->read_int(instream) );
            break;

        case 'Q': /* unsigned "64-bit integer" */
            aSV = newSVnv( instream->read_long(instream) );
            break;

        case 'T': /* string */
            len = instream->read_vint(instream);
            aSV = newSV(len + 1);
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            instream->read_chars(instream, string, 0, len);
            *SvEND(aSV) = '\0'; /* keep the PV NUL-terminated */
            break;

        case 'V': /* VInt */
            aSV = newSVuv( instream->read_vint(instream) );
            break;

        case 'W': /* VLong */
            aSV = newSVnv( instream->read_vlong(instream) );
            break;

        default:
            aSV = NULL; /* suppress unused var compiler warning */
            Kino1_confess("Invalid type in template: '%c'", sym);
        }

        /* Put a scalar on the stack, use up one symbol or repeater */
        XPUSHs( sv_2mortal(aSV) );
        repeat_count -= 1;
    }
}

=for comment
Release the C-level InStream when the Perl object is destroyed.

=cut

void
DESTROY(instream)
    InStream *instream;
PPCODE:
    Kino1_InStream_destroy(instream);

__H__


#ifndef H_KINOSEARCH_STORE_INSTREAM
#define H_KINOSEARCH_STORE_INSTREAM 1

#include "EXTERN.h"
#include "perl.h"
#include "XSUB.h"
#include "KinoSearch1UtilCarp.h"
#include "KinoSearch1UtilMathUtils.h"

/* Detect whether we're on an ASCII or EBCDIC machine.
*/ 302#if '0' == 240 303#define KINO_NUM_CHAR_OFFSET 240 304#else 305#define KINO_NUM_CHAR_OFFSET 48 306#endif 307 308#define KINO_IO_STREAM_BUF_SIZE 1024 309 310typedef struct instream { 311 PerlIO *fh; 312 SV *fh_sv; 313 double offset; 314 double len; 315 char *buf; 316 Off_t buf_start; /* file position of start of buffer */ 317 int buf_len; /* number of valid bytes in the buffer */ 318 int buf_pos; /* next byte to read */ 319 void (*seek)(struct instream*, double); 320 double (*tell)(struct instream*); 321 char (*read_byte)(struct instream*); 322 void (*read_bytes)(struct instream*, char*, STRLEN); 323 void (*read_chars)(struct instream*, char*, STRLEN, STRLEN); 324 U32 (*read_int)(struct instream*); 325 double (*read_long)(struct instream*); 326 U32 (*read_vint)(struct instream*); 327 double (*read_vlong)(struct instream*); 328} InStream; 329 330InStream* Kino1_InStream_new (char*, SV*, double, double); 331void Kino1_InStream_seek (InStream*, double); 332double Kino1_InStream_tell (InStream*); 333void Kino1_InStream_refill (InStream*); 334char Kino1_InStream_read_byte (InStream*); 335void Kino1_InStream_read_bytes (InStream*, char*, STRLEN); 336void Kino1_InStream_read_chars (InStream*, char*, STRLEN, STRLEN); 337U32 Kino1_InStream_read_int (InStream*); 338double Kino1_InStream_read_long (InStream*); 339U32 Kino1_InStream_decode_vint(char**); 340U32 Kino1_InStream_read_vint (InStream*); 341double Kino1_InStream_read_vlong (InStream*); 342void Kino1_InStream_destroy (InStream*); 343 344#endif /* include guard */ 345 346__C__ 347 348#include "KinoSearch1StoreInStream.h" 349 350 351InStream* 352Kino1_InStream_new(char *class, SV *fh_sv, double offset, double len ) { 353 InStream *instream; 354 355 /* allocate */ 356 Kino1_New(0, instream, 1, InStream); 357 358 /* assign */ 359 instream->fh_sv = newSVsv(fh_sv); 360 instream->fh = IoIFP( sv_2io(fh_sv) ); 361 instream->offset = offset; 362 363 /* init buffer */ 364 instream->buf = NULL; 365 instream->buf_start = 0; 
366 instream->buf_len = 0; 367 instream->buf_pos = 0; 368 369 /* seek */ 370 if (offset != 0) { 371 PerlIO_seek(instream->fh, offset, 0); 372 } 373 374 /* calculate len if an (intentionally) invalid value was supplied */ 375 if (len < 0.0) { 376 double bookmark = PerlIO_tell(instream->fh); 377 PerlIO_seek(instream->fh, 0, 2); 378 len = PerlIO_tell(instream->fh); 379 PerlIO_seek(instream->fh, bookmark, 0); 380 } 381 instream->len = len; 382 383 /* assign methods */ 384 instream->seek = Kino1_InStream_seek; 385 instream->tell = Kino1_InStream_tell; 386 instream->read_byte = Kino1_InStream_read_byte; 387 instream->read_bytes = Kino1_InStream_read_bytes; 388 instream->read_chars = Kino1_InStream_read_chars; 389 instream->read_int = Kino1_InStream_read_int; 390 instream->read_long = Kino1_InStream_read_long; 391 instream->read_vint = Kino1_InStream_read_vint; 392 instream->read_vlong = Kino1_InStream_read_vlong; 393 394 return instream; 395} 396 397void 398Kino1_InStream_seek(InStream *instream, double target) { 399 /* seek within buffer if possible */ 400 if ( (target >= instream->buf_start) 401 && (target < (instream->buf_start + instream->buf_pos)) 402 ) { 403 instream->buf_pos = target - instream->buf_start; 404 } 405 /* nope, not possible, so seek within file and prepare to refill */ 406 else { 407 instream->buf_start = target; 408 instream->buf_pos = 0; 409 instream->buf_len = 0; 410 PerlIO_seek(instream->fh, target + instream->offset, 0); 411 } 412} 413 414double 415Kino1_InStream_tell(InStream *instream) { 416 return instream->buf_start + instream->buf_pos; 417} 418 419void 420Kino1_InStream_refill(InStream *instream) { 421 int check_val; 422 423 /* wait to allocate buffer until it's needed */ 424 if (instream->buf == NULL) 425 Kino1_New(0, instream->buf, KINO_IO_STREAM_BUF_SIZE, char); 426 427 /* add bytes read to file position, reset */ 428 instream->buf_start += instream->buf_pos; 429 instream->buf_pos = 0; 430 431 /* calculate the number of bytes to read */ 
432 if (KINO_IO_STREAM_BUF_SIZE < instream->len - instream->buf_start) 433 instream->buf_len = KINO_IO_STREAM_BUF_SIZE; 434 else 435 instream->buf_len = instream->len - instream->buf_start; 436 437 /* perform the file operations */ 438 PerlIO_seek(instream->fh, 0, 1); 439 check_val = PerlIO_seek(instream->fh, 440 (instream->buf_start + instream->offset), 0); 441 if (check_val == -1) 442 Kino1_confess("refill: PerlIO_seek failed: %d", errno); 443 check_val = PerlIO_read(instream->fh, instream->buf, instream->buf_len); 444 if (check_val != instream->buf_len) 445 Kino1_confess("refill: tried to read %d bytes, got %d: %d", 446 instream->buf_len, check_val, errno); 447} 448 449char 450Kino1_InStream_read_byte(InStream *instream) { 451 if (instream->buf_pos >= instream->buf_len) 452 Kino1_InStream_refill(instream); 453 return instream->buf[ instream->buf_pos++ ]; 454} 455 456void 457Kino1_InStream_read_bytes (InStream *instream, char* buf, STRLEN len) { 458 if (instream->buf_pos + len < instream->buf_len) { 459 /* request is entirely within buffer, so copy */ 460 Copy((instream->buf + instream->buf_pos), buf, len, char); 461 instream->buf_pos += len; 462 } 463 else { 464 /* get the request from the file and reset buffer */ 465 int check_val; 466 Off_t start; 467 start = instream->tell(instream); 468 check_val = PerlIO_seek(instream->fh, (start + instream->offset), 0); 469 if (check_val == -1) 470 Kino1_confess("read_bytes: PerlIO_seek failed: %d", errno ); 471 check_val = PerlIO_read(instream->fh, buf, len); 472 if (check_val < len) 473 Kino1_confess("read_bytes: tried to read %"UVuf" bytes, got %d", 474 (UV)len, check_val); 475 476 /* reset vars and refill if there's more in the file */ 477 instream->buf_start = start + len; 478 instream->buf_pos = 0; 479 instream->buf_len = 0; 480 if (instream->buf_start < instream->len) 481 Kino1_InStream_refill(instream); 482 } 483} 484 485/* This is just a wrapper for read_bytes, but that may change. 
It should 486 * be used whenever Lucene character data is being read, typically after 487 * read_vint as part of a String read. If and when a change does come, it will 488 * be a lot easier to track down all the relevant code fragments if read_chars 489 * gets used consistently. 490 */ 491void 492Kino1_InStream_read_chars(InStream *instream, char *buf, STRLEN start, 493 STRLEN len) { 494 buf += start; 495 instream->read_bytes(instream, buf, len); 496} 497 498U32 499Kino1_InStream_read_int (InStream *instream) { 500 unsigned char buf[4]; 501 instream->read_bytes(instream, (char*)buf, 4); 502 return Kino1_decode_bigend_U32(buf); 503} 504 505double 506Kino1_InStream_read_long (InStream *instream) { 507 unsigned char buf[8]; 508 double aDouble; 509 510 /* get 8 bytes from the stream */ 511 instream->read_bytes(instream, (char*)buf, 8); 512 513 /* get high 4 bytes, multiply by 2**32 */ 514 aDouble = Kino1_decode_bigend_U32(buf); 515 aDouble = aDouble * pow(2.0, 32.0); 516 517 /* decode low four bytes as unsigned int and add to total */ 518 aDouble += Kino1_decode_bigend_U32(&buf[4]); 519 520 return aDouble; 521} 522 523/* read in a Variable INTeger, stored in 1-5 bytes */ 524U32 525Kino1_InStream_read_vint (InStream *instream) { 526 unsigned char aUChar; 527 int bitshift; 528 U32 aU32; 529 530 /* start by reading one byte; use the lower 7 bits */ 531 aUChar = (unsigned char)instream->read_byte(instream); 532 aU32 = aUChar & 0x7f; 533 534 /* keep reading and shifting as long as the high bit is set */ 535 for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) { 536 aUChar = (unsigned char)instream->read_byte(instream); 537 aU32 |= (aUChar & 0x7f) << bitshift; 538 } 539 return aU32; 540} 541 542U32 543Kino1_InStream_decode_vint(char **source_ptr) { 544 char *source; 545 int bitshift; 546 U32 aU32; 547 548 source = *source_ptr; 549 aU32 = (unsigned char)*source & 0x7f; 550 for (bitshift = 7; (*source & 0x80) != 0; bitshift += 7) { 551 source++; 552 aU32 |= ((unsigned 
char)*source & 0x7f) << bitshift; 553 } 554 source++; 555 *source_ptr = source; 556 return aU32; 557} 558 559double 560Kino1_InStream_read_vlong (InStream *instream) { 561 unsigned char aUChar; 562 int bitshift; 563 double aDouble; 564 565 aUChar = (unsigned char)instream->read_byte(instream); 566 aDouble = aUChar & 0x7f; 567 for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) { 568 aUChar = (unsigned char)instream->read_byte(instream); 569 aDouble += (aUChar & 0x7f) * pow(2, bitshift); 570 } 571 return aDouble; 572} 573 574 575void 576Kino1_InStream_destroy(InStream* instream) { 577 SvREFCNT_dec(instream->fh_sv); 578 Kino1_Safefree(instream->buf); 579 Kino1_Safefree(instream); 580} 581 582__POD__ 583 584==begin devdocs 585 586==head1 NAME 587 588KinoSearch1::Store::InStream - filehandles for reading invindexes 589 590==head1 SYNOPSIS 591 592 # isa blessed filehandle 593 594 my $instream = $invindex->open_instream( $filehandle, $offset, $length ); 595 my @ten_vints = $instream->lu_read('V10'); 596 597==head1 DESCRIPTION 598 599The InStream class abstracts out all input operations to KinoSearch1. 600 601InStream is implemented as a inside-out object around a blessed filehandle. 602It would almost be possible to use an ordinary filehandle, but the 603objectification is necessary because InStreams have to be capable of 604pretending that they are acting upon a distinct file when in reality they may 605be reading only a portion of a compound file. 606 607For the template used by lu_read, see InStream's companion, 608L<OutStream|KinoSearch1::Store::OutStream>. 609 610==head1 COPYRIGHT 611 612Copyright 2005-2010 Marvin Humphrey 613 614==head1 LICENSE, DISCLAIMER, BUGS, etc. 615 616See L<KinoSearch1> version 1.01. 617 618==end devdocs 619==cut 620 621