1package KinoSearch1::Store::InStream;
2use base qw( KinoSearch1::Util::CClass );
3use strict;
4use warnings;
5use KinoSearch1::Util::ToolSet;
6
# Close the filehandle returned by get_fh and pass back the result of the
# built-in close.
sub close {
    my $self = shift;
    return CORE::close( $self->get_fh );
}
8
9=for comment
10Dupe the filehandle and create a new object around the dupe.  Seek the dupe
11to the same spot as the original.
12
13=cut
14
sub clone_stream {
    my ($self) = @_;

    # Open a second read handle from the existing one using mode '<&='.
    my $duped_fh;
    if ( !open( $duped_fh, '<&=', $self->get_fh ) ) {
        confess("Couldn't dupe filehandle: $!");
    }

    # Wrap the new handle in a fresh object with the same offset and length,
    # then move it to the position this stream currently reports.
    my $offset    = $self->get_offset;
    my $length    = $self->length;
    my $evil_twin = __PACKAGE__->new( $duped_fh, $offset, $length );
    $evil_twin->seek( $self->tell );

    return $evil_twin;
}
24
251;
26
27__END__
28
29__XS__
30
31MODULE = KinoSearch1    PACKAGE = KinoSearch1::Store::InStream
32
33=begin comment
34
    my $instream = KinoSearch1::Store::InStream->new(
36        $filehandle, $offset, $length
37    );
38
39Constructor.  Takes 1-3 arguments, and unlike most classes in the KinoSearch1
40suite, the arguments to the constructor are not labeled parameters.
41
42The second argument, an offset, defaults to 0 if not supplied.  Non-zero
43offsets get factored in when calling seek and tell.
44
45The last argument, a length, is the length of the "file" in bytes.  Supplying
46an explicit value is only essential for InStreams which are assigned to read a
47portion of a compound file -- otherwise, the length gets auto-calculated
48correctly.
49
50=end comment
51=cut
52
InStream*
new(class, fh_sv, ...)
    char   *class;
    SV     *fh_sv;
PREINIT:
    double  offset = 0;
    double  len    = -1;
CODE:
    /* Optional third argument: the offset.  An undefined value leaves the
     * default of 0 in place. */
    if (items >= 3 && SvOK( ST(2) ))
        offset = SvNV( ST(2) );

    /* Optional fourth argument: the length.  An undefined value leaves the
     * default of -1 in place, which Kino1_InStream_new treats as a request
     * to calculate the length itself. */
    if (items >= 4 && SvOK( ST(3) ))
        len = SvNV( ST(3) );

    RETVAL = Kino1_InStream_new(class, fh_sv, offset, len);
OUTPUT:
    RETVAL
75
76
77=for comment
78Seek to target plus the object's start offset.
79
80=cut
81
void
seek(instream, target)
    InStream *instream;
    double    target;
PPCODE:
    /* Dispatch through the seek slot of the struct; nothing is pushed onto
     * the Perl stack, so the caller receives an empty list. */
    (*instream->seek)(instream, target);
88
89=for comment
90Return the filehandle's position minus the offset.
91
92=cut
93
double
tell(instream)
    InStream *instream;
CODE:
    /* Dispatch through the tell slot of the struct and hand the position
     * back to Perl as a floating point number. */
    RETVAL = (*instream->tell)(instream);
OUTPUT:
    RETVAL
100
101=for comment
102Return the length of the "file" in bytes, factoring in the offset.
103
104=cut
105
double
length(instream)
    InStream *instream;
CODE:
    /* Report the len member exactly as stored in the struct. */
    RETVAL = instream->len;
OUTPUT:
    RETVAL
112
113=begin comment
114
115    @items = $instream->lu_read( TEMPLATE );
116
Read the items specified by TEMPLATE from the InStream.  This comment
describes lu_read, which is defined after the accessor XSUB just below.
118
119=end comment
120=cut
121
SV*
_set_or_get(instream, ...)
    InStream *instream;
ALIAS:
    set_len      = 1
    get_len      = 2
    set_offset   = 3
    get_offset   = 4
    set_fh       = 5
    get_fh       = 6
CODE:
{
    /* The two KINO_*_SET_OR_GET_SWITCH macros are defined elsewhere;
     * presumably they wrap the cases below in a switch on the ALIAS index
     * -- confirm against their definition.  Each setter (odd number) falls
     * through into the matching getter (even number), so a setter also
     * returns the value it has just stored. */
    KINO_START_SET_OR_GET_SWITCH

    case 1:
        instream->len = SvNV( ST(1) );
        /* fall through */
    case 2:
        RETVAL = newSVnv(instream->len);
        break;

    case 3:
        instream->offset = SvNV( ST(1) );
        /* fall through */
    case 4:
        RETVAL = newSVnv(instream->offset);
        break;

    case 5:
        /* the filehandle is read-only from Perl space */
        Kino1_confess("Can't set_fh");
        /* fall through */
    case 6:
        /* hand back a copy of the stored filehandle SV */
        RETVAL = newSVsv(instream->fh_sv);
        break;

    KINO_END_SET_OR_GET_SWITCH
}
OUTPUT:
    RETVAL
154
155
void
lu_read (instream, template_sv)
    InStream *instream;
    SV       *template_sv;
PREINIT:
    STRLEN    tpt_len;         /* bytelength of template */
    char     *template;        /* ptr to a spot in the template */
    char     *tpt_end;         /* ptr to the end of the template */
    int       repeat_count;    /* number of times to repeat sym */
    char      sym = '\0';      /* the current symbol in the template */
    IV        aIV;
    SV       *aSV;
    char      aChar;
    char*     string;
    STRLEN    len;
PPCODE:
{
    /* Read one value from the stream per template item and push each onto
     * the Perl stack.  A template item is a symbol optionally followed by a
     * decimal count; items may be separated by spaces.  For 'a' the count is
     * a byte length, for every other symbol it is a repeat count. */

    /* Prepare template string pointers.  The end pointer is derived from
     * the buffer SvPV actually returned rather than from the SV's own
     * string slot, so the two can never disagree. */
    template = SvPV(template_sv, tpt_len);
    tpt_end  = template + tpt_len;

    repeat_count = 0;
    while (1) {
        if (repeat_count == 0) {
            /* fast-forward past space characters; the bounds test comes
             * first so that nothing at or beyond tpt_end is dereferenced */
            while (template < tpt_end && *template == ' ') {
                template++;
            }

            /* break out of the loop if we've exhausted the template */
            if (template == tpt_end) {
                break;
            }

            /* derive the current symbol */
            sym = *template++;

            if (   template < tpt_end
                && *template >= '0'
                && *template <= '9'
            ) {
                /* calculate numerical repeat count */
                repeat_count = 0;
                do {
                    repeat_count = (repeat_count * 10)
                        + (*template - KINO_NUM_CHAR_OFFSET);
                    template++;
                } while (  template < tpt_end
                        && *template >= '0'
                        && *template <= '9'
                );
            }
            else {
                /* sym is the last char in the template or has no numeric
                 * repeat count, so process it only once */
                repeat_count = 1;
            }
        }

        /* thwart potential infinite loop */
        if (repeat_count < 1)
            Kino1_confess( "invalid repeat_count: %d", repeat_count);

        /* Every case leaves a mortal SV in aSV.  The string cases mortalize
         * before reading so the SV is reclaimed if the read croaks. */
        switch(sym) {

        case 'a': /* arbitrary binary data */
            len = repeat_count;
            repeat_count = 1;
            aSV = sv_2mortal( newSV(len + 1) );
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            string[len] = '\0'; /* keep the Perl string NUL-terminated */
            instream->read_bytes(instream, string, len);
            break;

        case 'b': /* signed byte */
        case 'B': /* unsigned byte */
            aChar = instream->read_byte(instream);
            if (sym == 'b')
                aIV = (signed char)aChar;
            else
                aIV = (unsigned char)aChar;
            aSV = sv_2mortal( newSViv(aIV) );
            break;

        case 'i': /* signed 32-bit integer */
            aSV = sv_2mortal( newSViv( (I32)instream->read_int(instream) ) );
            break;

        case 'I': /* unsigned 32-bit integer */
            aSV = sv_2mortal( newSVuv( instream->read_int(instream) ) );
            break;

        case 'Q': /* unsigned "64-bit integer" */
            aSV = sv_2mortal( newSVnv( instream->read_long(instream) ) );
            break;

        case 'T': /* string */
            len = instream->read_vint(instream);
            aSV = sv_2mortal( newSV(len + 1) );
            SvCUR_set(aSV, len);
            SvPOK_on(aSV);
            string = SvPVX(aSV);
            string[len] = '\0'; /* keep the Perl string NUL-terminated */
            instream->read_chars(instream, string, 0, len);
            break;

        case 'V': /* VInt */
            aSV = sv_2mortal( newSVuv( instream->read_vint(instream) ) );
            break;

        case 'W': /* VLong */
            aSV = sv_2mortal( newSVnv( instream->read_vlong(instream) ) );
            break;

        default:
            aSV = NULL; /* suppress unused var compiler warning */
            Kino1_confess("Invalid type in template: '%c'", sym);
        }

        /* Put a scalar on the stack, use up one symbol or repeater */
        XPUSHs(aSV);
        repeat_count -= 1;
    }
}
282
void
DESTROY(instream)
    InStream *instream;
PPCODE:
    /* Release the C struct behind the Perl object; nothing is returned. */
    Kino1_InStream_destroy(instream);
288
289__H__
290
291
292#ifndef H_KINOSEARCH_STORE_INSTREAM
293#define H_KINOSEARCH_STORE_INSTREAM 1
294
295#include "EXTERN.h"
296#include "perl.h"
297#include "XSUB.h"
298#include "KinoSearch1UtilCarp.h"
299#include "KinoSearch1UtilMathUtils.h"
300
/* Numeric value of the character '0', subtracted from a digit character to
 * obtain the digit's value.  It is 48 unless the preprocessor reports '0' as
 * 240, which is taken to mean an EBCDIC machine.
 */
#if '0' != 240
#define KINO_NUM_CHAR_OFFSET 48
#else
#define KINO_NUM_CHAR_OFFSET 240
#endif

/* Size in bytes of the read buffer allocated for each InStream. */
#define KINO_IO_STREAM_BUF_SIZE 1024
309
310typedef struct instream {
311    PerlIO  *fh;
312    SV      *fh_sv;
313    double   offset;
314    double   len;
315    char    *buf;
316    Off_t    buf_start;    /* file position of start of buffer */
317    int      buf_len;      /* number of valid bytes in the buffer */
318    int      buf_pos;      /* next byte to read */
319    void   (*seek)(struct instream*, double);
320    double (*tell)(struct instream*);
321    char   (*read_byte)(struct instream*);
322    void   (*read_bytes)(struct instream*, char*, STRLEN);
323    void   (*read_chars)(struct instream*, char*, STRLEN, STRLEN);
324    U32    (*read_int)(struct instream*);
325    double (*read_long)(struct instream*);
326    U32    (*read_vint)(struct instream*);
327    double (*read_vlong)(struct instream*);
328} InStream;
329
330InStream* Kino1_InStream_new     (char*, SV*, double, double);
331void   Kino1_InStream_seek       (InStream*, double);
332double Kino1_InStream_tell       (InStream*);
333void   Kino1_InStream_refill     (InStream*);
334char   Kino1_InStream_read_byte  (InStream*);
335void   Kino1_InStream_read_bytes (InStream*, char*, STRLEN);
336void   Kino1_InStream_read_chars (InStream*, char*, STRLEN, STRLEN);
337U32    Kino1_InStream_read_int   (InStream*);
338double Kino1_InStream_read_long  (InStream*);
339U32    Kino1_InStream_decode_vint(char**);
340U32    Kino1_InStream_read_vint  (InStream*);
341double Kino1_InStream_read_vlong (InStream*);
342void   Kino1_InStream_destroy    (InStream*);
343
344#endif /* include guard */
345
346__C__
347
348#include "KinoSearch1StoreInStream.h"
349
350
351InStream*
352Kino1_InStream_new(char *class, SV *fh_sv, double offset, double len ) {
353    InStream *instream;
354
355    /* allocate */
356    Kino1_New(0, instream, 1, InStream);
357
358    /* assign */
359    instream->fh_sv       = newSVsv(fh_sv);
360    instream->fh          = IoIFP( sv_2io(fh_sv) );
361    instream->offset      = offset;
362
363    /* init buffer */
364    instream->buf       = NULL;
365    instream->buf_start = 0;
366    instream->buf_len   = 0;
367    instream->buf_pos   = 0;
368
369    /* seek */
370    if (offset != 0) {
371        PerlIO_seek(instream->fh, offset, 0);
372    }
373
374    /* calculate len if an (intentionally) invalid value was supplied */
375    if (len < 0.0) {
376        double bookmark = PerlIO_tell(instream->fh);
377        PerlIO_seek(instream->fh, 0, 2);
378        len = PerlIO_tell(instream->fh);
379        PerlIO_seek(instream->fh, bookmark, 0);
380    }
381    instream->len = len;
382
383    /* assign methods */
384    instream->seek       = Kino1_InStream_seek;
385    instream->tell       = Kino1_InStream_tell;
386    instream->read_byte  = Kino1_InStream_read_byte;
387    instream->read_bytes = Kino1_InStream_read_bytes;
388    instream->read_chars = Kino1_InStream_read_chars;
389    instream->read_int   = Kino1_InStream_read_int;
390    instream->read_long  = Kino1_InStream_read_long;
391    instream->read_vint  = Kino1_InStream_read_vint;
392    instream->read_vlong = Kino1_InStream_read_vlong;
393
394    return instream;
395}
396
397void
398Kino1_InStream_seek(InStream *instream, double target) {
399    /* seek within buffer if possible */
400    if (   (target >= instream->buf_start)
401        && (target <  (instream->buf_start + instream->buf_pos))
402    ) {
403        instream->buf_pos = target - instream->buf_start;
404    }
405    /* nope, not possible, so seek within file and prepare to refill */
406    else {
407        instream->buf_start = target;
408        instream->buf_pos   = 0;
409        instream->buf_len   = 0;
410        PerlIO_seek(instream->fh, target + instream->offset, 0);
411    }
412}
413
414double
415Kino1_InStream_tell(InStream *instream) {
416    return instream->buf_start + instream->buf_pos;
417}
418
419void
420Kino1_InStream_refill(InStream *instream) {
421    int check_val;
422
423    /* wait to allocate buffer until it's needed */
424    if (instream->buf == NULL)
425        Kino1_New(0, instream->buf, KINO_IO_STREAM_BUF_SIZE, char);
426
427    /* add bytes read to file position, reset */
428    instream->buf_start += instream->buf_pos;
429    instream->buf_pos = 0;
430
431    /* calculate the number of bytes to read */
432    if (KINO_IO_STREAM_BUF_SIZE < instream->len - instream->buf_start)
433        instream->buf_len = KINO_IO_STREAM_BUF_SIZE;
434    else
435        instream->buf_len = instream->len - instream->buf_start;
436
437    /* perform the file operations */
438    PerlIO_seek(instream->fh, 0, 1);
439    check_val = PerlIO_seek(instream->fh,
440        (instream->buf_start + instream->offset), 0);
441    if (check_val == -1)
442        Kino1_confess("refill: PerlIO_seek failed: %d", errno);
443    check_val = PerlIO_read(instream->fh, instream->buf, instream->buf_len);
444    if (check_val != instream->buf_len)
445        Kino1_confess("refill: tried to read %d bytes, got %d: %d",
446            instream->buf_len, check_val, errno);
447}
448
449char
450Kino1_InStream_read_byte(InStream *instream) {
451    if (instream->buf_pos >= instream->buf_len)
452        Kino1_InStream_refill(instream);
453    return instream->buf[ instream->buf_pos++ ];
454}
455
456void
457Kino1_InStream_read_bytes (InStream *instream, char* buf, STRLEN len) {
458    if (instream->buf_pos + len < instream->buf_len) {
459        /* request is entirely within buffer, so copy */
460        Copy((instream->buf + instream->buf_pos), buf, len, char);
461        instream->buf_pos += len;
462    }
463    else {
464        /* get the request from the file and reset buffer */
465        int check_val;
466        Off_t start;
467        start = instream->tell(instream);
468        check_val = PerlIO_seek(instream->fh, (start + instream->offset), 0);
469        if (check_val == -1)
470            Kino1_confess("read_bytes: PerlIO_seek failed: %d", errno );
471        check_val = PerlIO_read(instream->fh, buf, len);
472        if (check_val < len)
473            Kino1_confess("read_bytes: tried to read %"UVuf" bytes, got %d",
474                (UV)len, check_val);
475
476        /* reset vars and refill if there's more in the file */
477        instream->buf_start = start + len;
478        instream->buf_pos   = 0;
479        instream->buf_len   = 0;
480        if (instream->buf_start < instream->len)
481            Kino1_InStream_refill(instream);
482    }
483}
484
485/* This is just a wrapper for read_bytes, but that may change.  It should
486 * be used whenever Lucene character data is being read, typically after
487 * read_vint as part of a String read. If and when a change does come, it will
488 * be a lot easier to track down all the relevant code fragments if read_chars
489 * gets used consistently.
490 */
491void
492Kino1_InStream_read_chars(InStream *instream, char *buf, STRLEN start,
493                         STRLEN len) {
494    buf += start;
495    instream->read_bytes(instream, buf, len);
496}
497
498U32
499Kino1_InStream_read_int (InStream *instream) {
500    unsigned char buf[4];
501    instream->read_bytes(instream, (char*)buf, 4);
502    return Kino1_decode_bigend_U32(buf);
503}
504
505double
506Kino1_InStream_read_long (InStream *instream) {
507    unsigned char buf[8];
508    double        aDouble;
509
510    /* get 8 bytes from the stream */
511    instream->read_bytes(instream, (char*)buf, 8);
512
513    /* get high 4 bytes, multiply by 2**32 */
514    aDouble = Kino1_decode_bigend_U32(buf);
515    aDouble = aDouble * pow(2.0, 32.0);
516
517    /* decode low four bytes as unsigned int and add to total */
518    aDouble += Kino1_decode_bigend_U32(&buf[4]);
519
520    return aDouble;
521}
522
523/* read in a Variable INTeger, stored in 1-5 bytes */
524U32
525Kino1_InStream_read_vint (InStream *instream) {
526    unsigned char aUChar;
527    int           bitshift;
528    U32           aU32;
529
530    /* start by reading one byte; use the lower 7 bits */
531    aUChar = (unsigned char)instream->read_byte(instream);
532    aU32 = aUChar & 0x7f;
533
534    /* keep reading and shifting as long as the high bit is set */
535    for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) {
536        aUChar = (unsigned char)instream->read_byte(instream);
537        aU32 |= (aUChar & 0x7f) << bitshift;
538    }
539    return aU32;
540}
541
542U32
543Kino1_InStream_decode_vint(char **source_ptr) {
544    char *source;
545    int   bitshift;
546    U32   aU32;
547
548    source = *source_ptr;
549    aU32 = (unsigned char)*source & 0x7f;
550    for (bitshift = 7; (*source & 0x80) != 0; bitshift += 7) {
551        source++;
552         aU32 |= ((unsigned char)*source & 0x7f) << bitshift;
553    }
554    source++;
555    *source_ptr = source;
556    return aU32;
557}
558
559double
560Kino1_InStream_read_vlong (InStream *instream) {
561    unsigned char aUChar;
562    int           bitshift;
563    double        aDouble;
564
565    aUChar = (unsigned char)instream->read_byte(instream);
566    aDouble = aUChar & 0x7f;
567    for (bitshift = 7; (aUChar & 0x80) != 0; bitshift += 7) {
568        aUChar = (unsigned char)instream->read_byte(instream);
569        aDouble += (aUChar & 0x7f) * pow(2, bitshift);
570    }
571    return aDouble;
572}
573
574
575void
576Kino1_InStream_destroy(InStream* instream) {
577    SvREFCNT_dec(instream->fh_sv);
578    Kino1_Safefree(instream->buf);
579    Kino1_Safefree(instream);
580}
581
582__POD__
583
584==begin devdocs
585
586==head1 NAME
587
588KinoSearch1::Store::InStream - filehandles for reading invindexes
589
590==head1 SYNOPSIS
591
592    # isa blessed filehandle
593
594    my $instream  = $invindex->open_instream( $filehandle, $offset, $length );
595    my @ten_vints = $instream->lu_read('V10');
596
597==head1 DESCRIPTION
598
599The InStream class abstracts out all input operations to KinoSearch1.
600
InStream is implemented as an inside-out object around a blessed filehandle.
602It would almost be possible to use an ordinary filehandle, but the
603objectification is necessary because InStreams have to be capable of
604pretending that they are acting upon a distinct file when in reality they may
605be reading only a portion of a compound file.
606
607For the template used by lu_read, see InStream's companion,
608L<OutStream|KinoSearch1::Store::OutStream>.
609
610==head1 COPYRIGHT
611
612Copyright 2005-2010 Marvin Humphrey
613
614==head1 LICENSE, DISCLAIMER, BUGS, etc.
615
616See L<KinoSearch1> version 1.01.
617
618==end devdocs
619==cut
620
621