1# Copyright 2011 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6#    1. Redistributions of source code must retain the above copyright notice,
7#       this list of conditions and the following disclaimer.
8#
9#    2. Redistributions in binary form must reproduce the above copyright
10#       notice, this list of conditions and the following disclaimer in the
11#       documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""
29This module contains base classes/interfaces for "codec" objects.
30"""
31
32from bisect import bisect_right
33
34from whoosh import columns
35from whoosh.automata import lev
36from whoosh.compat import abstractmethod, izip, unichr, xrange
37from whoosh.filedb.compound import CompoundStorage
38from whoosh.system import emptybytes
39from whoosh.util import random_name
40
41
42# Exceptions
43
class OutOfOrderError(Exception):
    """Raised when postings are fed to a writer out of sorted
    (fieldname, termbytes) order.
    """
46
47
48# Base classes
49
class Codec(object):
    """Abstract base class for codec objects.

    A codec bundles together the writer and reader implementations that
    define how a segment's data (per-document values, inverted index,
    postings) is laid out in storage. Concrete codecs implement the factory
    methods below.
    """

    # Whether this codec records per-document field length statistics
    length_stats = True

    # Per document value writer

    @abstractmethod
    def per_document_writer(self, storage, segment):
        """Return a PerDocumentWriter for writing per-document values
        (stored fields, lengths, columns, vectors) to the given segment.
        """

        raise NotImplementedError

    # Inverted index writer

    @abstractmethod
    def field_writer(self, storage, segment):
        """Return a FieldWriter for writing the segment's inverted index."""

        raise NotImplementedError

    # Postings

    @abstractmethod
    def postings_writer(self, dbfile, byteids=False):
        """Return a PostingsWriter that writes posting data to dbfile.

        :param byteids: presumably selects bytes rather than int posting
            IDs -- confirm against concrete codec implementations.
        """

        raise NotImplementedError

    @abstractmethod
    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        """Return a reader/matcher over the postings described by terminfo."""

        raise NotImplementedError

    # Index readers

    def automata(self, storage, segment):
        # Default: generic DFA-based approximate term matching
        return Automata()

    @abstractmethod
    def terms_reader(self, storage, segment):
        """Return a TermsReader for the segment's term index."""

        raise NotImplementedError

    @abstractmethod
    def per_document_reader(self, storage, segment):
        """Return a PerDocumentReader for the segment's per-document data."""

        raise NotImplementedError

    # Segments and generations

    @abstractmethod
    def new_segment(self, storage, indexname):
        """Create and return a new, empty Segment object for this codec."""

        raise NotImplementedError
93
94
class WrappingCodec(Codec):
    """Base class for codecs that wrap another codec object.

    Every method simply delegates to the wrapped ("child") codec; subclasses
    override only the methods whose behavior they want to change.
    """

    def __init__(self, child):
        """
        :param child: the codec object to delegate to.
        """

        self._child = child

    def per_document_writer(self, storage, segment):
        return self._child.per_document_writer(storage, segment)

    def field_writer(self, storage, segment):
        return self._child.field_writer(storage, segment)

    def postings_writer(self, dbfile, byteids=False):
        return self._child.postings_writer(dbfile, byteids=byteids)

    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        return self._child.postings_reader(dbfile, terminfo, format_, term=term,
                                           scorer=scorer)

    def automata(self, storage, segment):
        return self._child.automata(storage, segment)

    def terms_reader(self, storage, segment):
        return self._child.terms_reader(storage, segment)

    def per_document_reader(self, storage, segment):
        return self._child.per_document_reader(storage, segment)

    def new_segment(self, storage, indexname):
        return self._child.new_segment(storage, indexname)
123
124
125# Writer classes
126
class PerDocumentWriter(object):
    """Abstract base for writers of per-document segment data (stored
    fields, field lengths, column values, and term vectors).
    """

    @abstractmethod
    def start_doc(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def add_field(self, fieldname, fieldobj, value, length):
        raise NotImplementedError

    @abstractmethod
    def add_column_value(self, fieldname, columnobj, value):
        raise NotImplementedError("Codec does not implement writing columns")

    @abstractmethod
    def add_vector_items(self, fieldname, fieldobj, items):
        raise NotImplementedError

    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
        """Drain the given vector matcher and write its postings as the
        document's term vector via add_vector_items().
        """

        def _iter_postings():
            # Walk the matcher, yielding one (text, weight, value) triple
            # per posting until the matcher is exhausted
            while vmatcher.is_active():
                posting = (vmatcher.id(), vmatcher.weight(), vmatcher.value())
                yield posting
                vmatcher.next()

        self.add_vector_items(fieldname, fieldobj, _iter_postings())

    def finish_doc(self):
        # Subclasses may override to flush per-document state
        pass

    def close(self):
        pass
159
160
class FieldWriter(object):
    """Abstract base for writers of a segment's inverted index.

    Subclasses implement the start_field()/start_term()/add()/finish_term()
    event methods; add_postings() drives them from a sorted posting stream.
    """

    def add_postings(self, schema, lengths, items):
        """Translate a stream of (fieldname, btext, docnum, weight, value)
        tuples -- sorted by field name, then term bytes -- into calls to
        start_field(), start_term(), add(), finish_term() and finish_field().

        :param schema: mapping of field names to field objects.
        :param lengths: an object with a doc_field_length() method, or None
            to record all lengths as 0.
        :param items: the sorted posting stream.
        :raises OutOfOrderError: if the stream is not properly sorted.
        """

        # Bind the event methods to locals for speed inside the loop
        on_field = self.start_field
        on_term = self.start_term
        on_posting = self.add
        end_term = self.finish_term
        end_field = self.finish_field

        if lengths:
            get_length = lengths.doc_field_length
        else:
            get_length = lambda docnum, fieldname: 0

        prev_field = None   # fieldname of the previous posting
        prev_btext = None   # term bytes of the previous posting
        fieldobj = None     # schema field object for the current field
        for fieldname, btext, docnum, weight, value in items:
            # Guard against out-of-order input. The None checks exist because
            # Python 3 can't compare a string with None
            if prev_field is not None and fieldname < prev_field:
                raise OutOfOrderError("Field %r .. %r" % (prev_field, fieldname))
            if fieldname == prev_field and prev_btext and btext < prev_btext:
                raise OutOfOrderError("Term %s:%r .. %s:%r"
                                      % (prev_field, prev_btext, fieldname,
                                         btext))

            # On a field change, close out the previous term and field and
            # open the new field
            if fieldname != prev_field:
                if prev_btext is not None:
                    end_term()
                if prev_field is not None:
                    end_field()
                fieldobj = schema[fieldname]
                on_field(fieldname, fieldobj)
                prev_field = fieldname
                prev_btext = None

            # HACK: a docnum of -1 marks a word meant only for the spelling
            # graph, not the postings; skip it here
            if docnum == -1:
                continue

            # On a term change within the field, close out the previous term
            # and open the new one
            if btext != prev_btext:
                if prev_btext is not None:
                    end_term()
                on_term(btext)
                prev_btext = btext

            # Write this posting
            doclen = get_length(docnum, fieldname)
            on_posting(docnum, weight,
                       emptybytes if value is None else value, doclen)

        # Close out whatever was still open when the stream ended
        if prev_btext is not None:
            end_term()
        if prev_field is not None:
            end_field()

    @abstractmethod
    def start_field(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def start_term(self, text):
        raise NotImplementedError

    @abstractmethod
    def add(self, docnum, weight, vbytes, length):
        raise NotImplementedError

    def add_spell_word(self, fieldname, text):
        raise NotImplementedError

    @abstractmethod
    def finish_term(self):
        raise NotImplementedError

    def finish_field(self):
        # Optional hook; subclasses may override
        pass

    def close(self):
        pass
262
263
264# Postings
265
class PostingsWriter(object):
    """Interface for writers of a single term's posting list."""

    @abstractmethod
    def start_postings(self, format_, terminfo):
        """Begin a new posting list using the given posting format; terminfo
        is presumably updated with term statistics as postings are added --
        confirm against concrete implementations.
        """

        raise NotImplementedError

    @abstractmethod
    def add_posting(self, id_, weight, vbytes, length=None):
        """Add a single posting (ID, weight, encoded value bytes, and
        optional field length) to the current list.
        """

        raise NotImplementedError

    def finish_postings(self):
        # Optional hook; subclasses may override to flush the current list
        pass

    @abstractmethod
    def written(self):
        """Returns True if this object has already written to disk.
        """

        raise NotImplementedError
284
285
286# Reader classes
287
class FieldCursor(object):
    """Interface for cursors that walk the terms of a single field in
    sorted order.
    """

    def first(self):
        """Position the cursor at the first term of the field."""
        raise NotImplementedError

    def find(self, string):
        """Seek to the first term equal to or greater than the given
        string.
        """
        raise NotImplementedError

    def next(self):
        """Advance the cursor to the next term."""
        raise NotImplementedError

    def term(self):
        """Return the current term (None presumably indicates the cursor is
        exhausted -- see EmptyCursor).
        """
        raise NotImplementedError
300
301
class TermsReader(object):
    """Interface for readers of a segment's term index, mapping terms to
    term information and posting matchers.
    """

    @abstractmethod
    def __contains__(self, term):
        # term is presumably a (fieldname, btext) pair -- confirm against
        # concrete codec implementations
        raise NotImplementedError

    @abstractmethod
    def cursor(self, fieldname, fieldobj):
        """Return a FieldCursor over the terms of the given field."""

        raise NotImplementedError

    @abstractmethod
    def terms(self):
        """Return an iterator of the term keys in the index."""

        raise NotImplementedError

    @abstractmethod
    def terms_from(self, fieldname, prefix):
        """Return an iterator of the given field's term keys, starting at
        the given prefix.
        """

        raise NotImplementedError

    @abstractmethod
    def items(self):
        """Return an iterator of (termkey, terminfo) pairs."""

        raise NotImplementedError

    @abstractmethod
    def items_from(self, fieldname, prefix):
        """Return an iterator of (termkey, terminfo) pairs for the given
        field, starting at the given prefix.
        """

        raise NotImplementedError

    @abstractmethod
    def term_info(self, fieldname, text):
        """Return a term information object for the given term."""

        raise NotImplementedError

    @abstractmethod
    def frequency(self, fieldname, text):
        # NOTE(review): decorated @abstractmethod but ships a working default
        # built on term_info(); subclasses may override for speed
        return self.term_info(fieldname, text).weight()

    @abstractmethod
    def doc_frequency(self, fieldname, text):
        # NOTE(review): same pattern as frequency() above
        return self.term_info(fieldname, text).doc_frequency()

    @abstractmethod
    def matcher(self, fieldname, text, format_, scorer=None):
        """Return a matcher over the postings of the given term."""

        raise NotImplementedError

    @abstractmethod
    def indexed_field_names(self):
        """Return the names of the fields that have indexed terms."""

        raise NotImplementedError

    def close(self):
        pass
349
350
class Automata(object):
    """Default implementation of approximate term matching: builds a
    Levenshtein DFA and intersects it with a field's term cursor.
    """

    @staticmethod
    def levenshtein_dfa(uterm, maxdist, prefix=0):
        """Return a DFA accepting strings within maxdist edits of uterm
        (with an optional exact prefix length).
        """

        return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa()

    @staticmethod
    def find_matches(dfa, cur):
        """Yield the terms reachable from the given field cursor that the
        DFA accepts, in cursor order.
        """

        unull = unichr(0)

        current = cur.text()
        if current is None:
            return

        candidate = dfa.next_valid_string(current)
        while candidate:
            cur.find(candidate)
            current = cur.text()
            if current is None:
                return
            if candidate == current:
                yield candidate
                # Append NUL so the DFA is asked for a string strictly
                # after the term we just yielded
                current += unull
            candidate = dfa.next_valid_string(current)

    def terms_within(self, fieldcur, uterm, maxdist, prefix=0):
        """Yield the terms in the field within maxdist edits of uterm."""

        return self.find_matches(self.levenshtein_dfa(uterm, maxdist, prefix),
                                 fieldcur)
378
379
380# Per-doc value reader
381
class PerDocumentReader(object):
    """Abstract base for readers of a segment's per-document data: stored
    fields, deletions, field lengths, columns, and vectors.
    """

    def close(self):
        pass

    @abstractmethod
    def doc_count(self):
        """Return the number of undeleted documents."""

        raise NotImplementedError

    @abstractmethod
    def doc_count_all(self):
        """Return the total number of documents, deleted or not."""

        raise NotImplementedError

    # Deletions

    @abstractmethod
    def has_deletions(self):
        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    def all_doc_ids(self):
        """
        Returns an iterator of all (undeleted) document IDs in the reader.
        """

        deleted = self.is_deleted
        for docnum in xrange(self.doc_count_all()):
            if not deleted(docnum):
                yield docnum

    def iter_docs(self):
        """Yield a (docnum, stored_fields) pair for each undeleted
        document.
        """

        stored = self.stored_fields
        for docnum in self.all_doc_ids():
            yield docnum, stored(docnum)

    # Columns

    def supports_columns(self):
        return False

    def has_column(self, fieldname):
        return False

    def list_columns(self):
        raise NotImplementedError

    # Only needs overriding when supports_columns() returns True
    def column_reader(self, fieldname, column):
        raise NotImplementedError

    # Bitmaps

    def field_docs(self, fieldname):
        return None

    # Lengths

    @abstractmethod
    def doc_field_length(self, docnum, fieldname, default=0):
        raise NotImplementedError

    @abstractmethod
    def field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def min_field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def max_field_length(self, fieldname):
        raise NotImplementedError

    # Vectors

    def has_vector(self, docnum, fieldname):
        return False

    # Only needs overriding when has_vector() can return True
    def vector(self, docnum, fieldname, format_):
        raise NotImplementedError

    # Stored

    @abstractmethod
    def stored_fields(self, docnum):
        raise NotImplementedError

    def all_stored_fields(self):
        """Yield the stored fields of each undeleted document."""

        return (self.stored_fields(docnum) for docnum in self.all_doc_ids())
477
478
479# Segment base class
480
class Segment(object):
    """Do not instantiate this object directly. It is used by the Index
    object to hold information about a segment. A list of objects of this
    class are pickled as part of the TOC file.

    The TOC file stores only a minimal amount of information -- mostly the
    list of Segment objects. The segments themselves are the real reverse
    indexes. Keeping multiple segments makes incremental indexing cheap:
    new documents go into a fresh segment, which the index overlays on the
    existing ones when reading/searching. "Optimizing" merges all segments
    into one, dropping deleted documents in the process.
    """

    # Extension for compound segment files
    COMPOUND_EXT = ".seg"

    # Instance attributes: self.indexname, self.segid

    def __init__(self, indexname):
        self.indexname = indexname
        self.segid = self._random_id()
        self.compound = False

    @classmethod
    def _random_id(cls, size=16):
        # Delegate to the utility function so IDs are filename-safe
        return random_name(size=size)

    def __repr__(self):
        return "<%s %s>" % (self.__class__.__name__, self.segment_id())

    def codec(self):
        raise NotImplementedError

    def index_name(self):
        return self.indexname

    def segment_id(self):
        if not hasattr(self, "name"):
            return "%s_%s" % (self.index_name(), self.segid)
        # Old segment class stored an explicit name attribute
        return self.name

    def is_compound(self):
        # Segments unpickled from old indexes may lack the attribute
        return getattr(self, "compound", False)

    # File convenience methods

    def make_filename(self, ext):
        """Return the segment's file name for the given extension."""

        return "%s%s" % (self.segment_id(), ext)

    def list_files(self, storage):
        """Return the names of all files in the storage belonging to this
        segment.
        """

        prefix = "%s." % self.segment_id()
        return [fname for fname in storage.list() if fname.startswith(prefix)]

    def create_file(self, storage, ext, **kwargs):
        """Convenience method to create a new file in the given storage named
        with this segment's ID and the given extension. Any keyword arguments
        are passed to the storage's create_file method.
        """

        return storage.create_file(self.make_filename(ext), **kwargs)

    def open_file(self, storage, ext, **kwargs):
        """Convenience method to open a file in the given storage named with
        this segment's ID and the given extension. Any keyword arguments are
        passed to the storage's open_file method.
        """

        return storage.open_file(self.make_filename(ext), **kwargs)

    def create_compound_file(self, storage):
        """Assemble all of this segment's files into a single compound file
        and delete the originals.
        """

        tocopy = self.list_files(storage)
        # There must not already be a compound file for this segment
        assert not any(fname.endswith(self.COMPOUND_EXT) for fname in tocopy)
        dest = self.create_file(storage, self.COMPOUND_EXT)
        CompoundStorage.assemble(dest, storage, tocopy)
        for fname in tocopy:
            storage.delete_file(fname)
        self.compound = True

    def open_compound_file(self, storage):
        """Open this segment's compound file as a CompoundStorage object."""

        dbfile = storage.open_file(self.make_filename(self.COMPOUND_EXT))
        return CompoundStorage(dbfile, use_mmap=storage.supports_mmap)

    # Abstract methods

    @abstractmethod
    def doc_count_all(self):
        """
        Returns the total number of documents, DELETED OR UNDELETED, in this
        segment.
        """

        raise NotImplementedError

    def doc_count(self):
        """
        Returns the number of (undeleted) documents in this segment.
        """

        return self.doc_count_all() - self.deleted_count()

    def set_doc_count(self, doccount):
        raise NotImplementedError

    def has_deletions(self):
        """
        Returns True if any documents in this segment are deleted.
        """

        return self.deleted_count() > 0

    @abstractmethod
    def deleted_count(self):
        """
        Returns the total number of deleted documents in this segment.
        """

        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    @abstractmethod
    def delete_document(self, docnum, delete=True):
        """Deletes the given document number. The document is not actually
        removed from the index until it is optimized.

        :param docnum: The document number to delete.
        :param delete: If False, this undeletes a deleted document.
        """

        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        """
        Returns True if the given document number is deleted.
        """

        raise NotImplementedError

    def should_assemble(self):
        return True
633
634
635# Wrapping Segment
636
class WrappingSegment(Segment):
    """A segment that wraps another ("child") segment and delegates every
    method to it. Subclass to override selected behavior.

    Note: deliberately does not call Segment.__init__, so it has no
    indexname/segid of its own; all state lives in the child segment.
    """

    def __init__(self, child):
        self._child = child

    def codec(self):
        return self._child.codec()

    def index_name(self):
        return self._child.index_name()

    def segment_id(self):
        return self._child.segment_id()

    def is_compound(self):
        return self._child.is_compound()

    def should_assemble(self):
        return self._child.should_assemble()

    def make_filename(self, ext):
        return self._child.make_filename(ext)

    def list_files(self, storage):
        return self._child.list_files(storage)

    def create_file(self, storage, ext, **kwargs):
        return self._child.create_file(storage, ext, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        return self._child.open_file(storage, ext, **kwargs)

    def create_compound_file(self, storage):
        return self._child.create_compound_file(storage)

    def open_compound_file(self, storage):
        return self._child.open_compound_file(storage)

    def delete_document(self, docnum, delete=True):
        return self._child.delete_document(docnum, delete=delete)

    def has_deletions(self):
        return self._child.has_deletions()

    def deleted_count(self):
        return self._child.deleted_count()

    def deleted_docs(self):
        return self._child.deleted_docs()

    def is_deleted(self, docnum):
        return self._child.is_deleted(docnum)

    def set_doc_count(self, doccount):
        self._child.set_doc_count(doccount)

    def doc_count(self):
        return self._child.doc_count()

    def doc_count_all(self):
        return self._child.doc_count_all()
697
698
699# Multi per doc reader
700
class MultiPerDocumentReader(PerDocumentReader):
    """Presents a sequence of per-document readers as a single reader by
    concatenating their document number ranges: reader i's documents come
    after all documents of readers 0..i-1.

    :param readers: the sub-readers to concatenate, in order.
    :param offset: unused; retained for backward compatibility.
    """

    def __init__(self, readers, offset=0):
        self._readers = readers

        # _doc_offsets[i] is the first global document number belonging to
        # readers[i]
        self._doc_offsets = []
        self._doccount = 0
        for pdr in readers:
            self._doc_offsets.append(self._doccount)
            self._doccount += pdr.doc_count_all()

        self.is_closed = False

    def close(self):
        """Close all sub-readers and mark this reader as closed."""

        for r in self._readers:
            r.close()
        self.is_closed = True

    def doc_count_all(self):
        return self._doccount

    def doc_count(self):
        return sum(r.doc_count() for r in self._readers)

    def _document_reader(self, docnum):
        # Index of the sub-reader that contains the given global docnum
        return max(0, bisect_right(self._doc_offsets, docnum) - 1)

    def _reader_and_docnum(self, docnum):
        # Translate a global docnum into (sub-reader index, local docnum)
        rnum = self._document_reader(docnum)
        offset = self._doc_offsets[rnum]
        return rnum, docnum - offset

    # Deletions

    def has_deletions(self):
        return any(r.has_deletions() for r in self._readers)

    def is_deleted(self, docnum):
        rnum, dn = self._reader_and_docnum(docnum)
        return self._readers[rnum].is_deleted(dn)

    def deleted_docs(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.deleted_docs():
                yield docnum + offset

    def all_doc_ids(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.all_doc_ids():
                yield docnum + offset

    # Columns

    def has_column(self, fieldname):
        return any(r.has_column(fieldname) for r in self._readers)

    def column_reader(self, fieldname, column):
        """Return a column reader spanning all sub-readers. Sub-readers
        without the column contribute the column's default value.

        :raises ValueError: if no sub-reader has the column.
        """

        if not self.has_column(fieldname):
            raise ValueError("No column %r" % (fieldname,))

        default = column.default_value()
        colreaders = []
        for r in self._readers:
            if r.has_column(fieldname):
                cr = r.column_reader(fieldname, column)
            else:
                cr = columns.EmptyColumnReader(default, r.doc_count_all())
            colreaders.append(cr)

        if len(colreaders) == 1:
            return colreaders[0]
        else:
            return columns.MultiColumnReader(colreaders)

    # Lengths

    def doc_field_length(self, docnum, fieldname, default=0):
        rnum, dn = self._reader_and_docnum(docnum)
        return self._readers[rnum].doc_field_length(dn, fieldname, default)

    def field_length(self, fieldname):
        return sum(r.field_length(fieldname) for r in self._readers)

    def min_field_length(self, fieldname):
        # FIX: previously took no fieldname and delegated to the sub-readers
        # without one, which cannot satisfy the PerDocumentReader interface
        # (min_field_length(self, fieldname)) and raised TypeError
        return min(r.min_field_length(fieldname) for r in self._readers)

    def max_field_length(self, fieldname):
        # FIX: see min_field_length() above
        return max(r.max_field_length(fieldname) for r in self._readers)
794
795
796# Extended base classes
797
class PerDocWriterWithColumns(PerDocumentWriter):
    """A PerDocumentWriter that implements column-value support on top of
    three primitive hooks supplied by subclasses: _has_column(),
    _create_column() and _get_column().
    """

    def __init__(self):
        PerDocumentWriter.__init__(self)
        # Subclasses are responsible for setting these attributes
        self._storage = None
        self._segment = None
        self._docnum = None

    @abstractmethod
    def _has_column(self, fieldname):
        """Return True if a column already exists for the given field."""

        raise NotImplementedError

    @abstractmethod
    def _create_column(self, fieldname, column):
        """Create a column writer for the given field."""

        raise NotImplementedError

    @abstractmethod
    def _get_column(self, fieldname):
        """Return the column writer for the given field."""

        raise NotImplementedError

    def add_column_value(self, fieldname, column, value):
        """Record a column value for the current document, lazily creating
        the field's column on first use.
        """

        if not self._has_column(fieldname):
            self._create_column(fieldname, column)
        colwriter = self._get_column(fieldname)
        colwriter.add(self._docnum, value)
822
823
824# FieldCursor implementations
825
class EmptyCursor(FieldCursor):
    """A FieldCursor over a field with no terms: navigation is a no-op,
    text() and term_info() are always None, and the cursor is never valid.
    """

    def first(self):
        return None

    def find(self, term):
        return None

    def next(self):
        return None

    def text(self):
        return None

    def term_info(self):
        return None

    def is_valid(self):
        return False
844