# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""
This module contains base classes/interfaces for "codec" objects.

A codec bundles together the writer and reader implementations that define
the on-disk format of a segment: per-document values (stored fields, lengths,
vectors, columns), the inverted index of terms, and postings.
"""

from bisect import bisect_right

from whoosh import columns
from whoosh.automata import lev
from whoosh.compat import abstractmethod, izip, unichr, xrange
from whoosh.filedb.compound import CompoundStorage
from whoosh.system import emptybytes
from whoosh.util import random_name


# Exceptions

class OutOfOrderError(Exception):
    """Raised when postings are fed to a FieldWriter out of sorted
    (fieldname, termbytes) order.
    """

    pass


# Base classes

class Codec(object):
    """Abstract interface for an index segment format.

    Concrete codecs supply factories for the writer objects used while
    indexing and the reader objects used while searching.
    """

    # Whether this codec records per-document field lengths
    length_stats = True

    # Per document value writer

    @abstractmethod
    def per_document_writer(self, storage, segment):
        raise NotImplementedError

    # Inverted index writer

    @abstractmethod
    def field_writer(self, storage, segment):
        raise NotImplementedError

    # Postings

    @abstractmethod
    def postings_writer(self, dbfile, byteids=False):
        raise NotImplementedError

    @abstractmethod
    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        raise NotImplementedError

    # Index readers

    def automata(self, storage, segment):
        # Default implementation; codecs with custom fuzzy-matching support
        # can override this
        return Automata()

    @abstractmethod
    def terms_reader(self, storage, segment):
        raise NotImplementedError

    @abstractmethod
    def per_document_reader(self, storage, segment):
        raise NotImplementedError

    # Segments and generations

    @abstractmethod
    def new_segment(self, storage, indexname):
        raise NotImplementedError


class WrappingCodec(Codec):
    """A codec that delegates every operation to a wrapped child codec.

    Subclass this to intercept/modify individual codec operations while
    inheriting the child's behavior for everything else.
    """

    def __init__(self, child):
        """
        :param child: the codec object to delegate to.
        """

        self._child = child

    def per_document_writer(self, storage, segment):
        return self._child.per_document_writer(storage, segment)

    def field_writer(self, storage, segment):
        return self._child.field_writer(storage, segment)

    def postings_writer(self, dbfile, byteids=False):
        return self._child.postings_writer(dbfile, byteids=byteids)

    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        return self._child.postings_reader(dbfile, terminfo, format_, term=term,
                                           scorer=scorer)

    def automata(self, storage, segment):
        return self._child.automata(storage, segment)

    def terms_reader(self, storage, segment):
        return self._child.terms_reader(storage, segment)

    def per_document_reader(self, storage, segment):
        return self._child.per_document_reader(storage, segment)

    def new_segment(self, storage, indexname):
        return self._child.new_segment(storage, indexname)


# Writer classes

class PerDocumentWriter(object):
    """Abstract interface for writing per-document values (stored fields,
    field lengths, vectors, and columns) to a segment.
    """

    @abstractmethod
    def start_doc(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def add_field(self, fieldname, fieldobj, value, length):
        raise NotImplementedError

    @abstractmethod
    def add_column_value(self, fieldname, columnobj, value):
        raise NotImplementedError("Codec does not implement writing columns")

    @abstractmethod
    def add_vector_items(self, fieldname, fieldobj, items):
        raise NotImplementedError

    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
        """Convenience method: reads (text, weight, valuestring) items off a
        matcher and forwards them to add_vector_items().
        """

        def readitems():
            while vmatcher.is_active():
                text = vmatcher.id()
                weight = vmatcher.weight()
                valuestring = vmatcher.value()
                yield (text, weight, valuestring)
                vmatcher.next()
        self.add_vector_items(fieldname, fieldobj, readitems())

    def finish_doc(self):
        pass

    def close(self):
        pass


class FieldWriter(object):
    """Abstract interface for writing the inverted index (terms and postings)
    of a segment.
    """

    def add_postings(self, schema, lengths, items):
        """Translates a generator of (fieldname, btext, docnum, weight, value)
        postings into calls to start_field(), start_term(), add(),
        finish_term(), finish_field(), etc.

        :param schema: the schema, used to look up field objects by name.
        :param lengths: an object with a doc_field_length() method, or None
            (in which case all lengths are reported as 0).
        :param items: an iterable of (fieldname, btext, docnum, weight, value)
            tuples, sorted by (fieldname, btext, docnum).
        :raises OutOfOrderError: if the items are not in sorted order.
        """

        # Bind methods to locals for speed in the loop
        start_field = self.start_field
        start_term = self.start_term
        add = self.add
        finish_term = self.finish_term
        finish_field = self.finish_field

        if lengths:
            dfl = lengths.doc_field_length
        else:
            dfl = lambda docnum, fieldname: 0

        # The fieldname of the previous posting
        lastfn = None
        # The bytes text of the previous posting
        lasttext = None
        # The (fieldname, btext) of the previous spelling posting
        lastspell = None
        # The field object for the current field
        fieldobj = None
        for fieldname, btext, docnum, weight, value in items:
            # Check for out-of-order postings. This is convoluted because
            # Python 3 removed the ability to compare a string to None
            if lastfn is not None and fieldname < lastfn:
                raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname))
            if fieldname == lastfn and lasttext and btext < lasttext:
                raise OutOfOrderError("Term %s:%r .. %s:%r"
                                      % (lastfn, lasttext, fieldname, btext))

            # If the fieldname of this posting is different from the last one,
            # tell the writer we're starting a new field
            if fieldname != lastfn:
                if lasttext is not None:
                    finish_term()
                # (fieldname != lastfn is already known true in this branch)
                if lastfn is not None:
                    finish_field()
                fieldobj = schema[fieldname]
                start_field(fieldname, fieldobj)
                lastfn = fieldname
                lasttext = None

            # HACK: items where docnum == -1 indicate words that should be
            # added to the spelling graph, not the postings
            if docnum == -1:
                # spellterm = (fieldname, btext)
                # # There can be duplicates of spelling terms, so only add a
                # # spell term if it's greater than the last one
                # if lastspell is None or spellterm > lastspell:
                #     spellword = fieldobj.from_bytes(btext)
                #     self.add_spell_word(fieldname, spellword)
                #     lastspell = spellterm
                continue

            # If this term is different from the term in the previous posting,
            # tell the writer to start a new term
            if btext != lasttext:
                if lasttext is not None:
                    finish_term()
                start_term(btext)
                lasttext = btext

            # Add this posting
            length = dfl(docnum, fieldname)
            if value is None:
                value = emptybytes
            add(docnum, weight, value, length)

        # Close out any open term/field
        if lasttext is not None:
            finish_term()
        if lastfn is not None:
            finish_field()

    @abstractmethod
    def start_field(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def start_term(self, text):
        raise NotImplementedError

    @abstractmethod
    def add(self, docnum, weight, vbytes, length):
        raise NotImplementedError

    def add_spell_word(self, fieldname, text):
        raise NotImplementedError

    @abstractmethod
    def finish_term(self):
        raise NotImplementedError

    def finish_field(self):
        pass

    def close(self):
        pass


# Postings

class PostingsWriter(object):
    """Abstract interface for writing a single term's postings list."""

    @abstractmethod
    def start_postings(self, format_, terminfo):
        raise NotImplementedError

    @abstractmethod
    def add_posting(self, id_, weight, vbytes, length=None):
        raise NotImplementedError

    def finish_postings(self):
        pass

    @abstractmethod
    def written(self):
        """Returns True if this object has already written to disk.
        """

        raise NotImplementedError


# Reader classes

class FieldCursor(object):
    """Abstract interface for iterating over the terms of a single field."""

    def first(self):
        raise NotImplementedError

    def find(self, string):
        raise NotImplementedError

    def next(self):
        raise NotImplementedError

    def term(self):
        raise NotImplementedError


class TermsReader(object):
    """Abstract interface for reading term information from a segment."""

    @abstractmethod
    def __contains__(self, term):
        raise NotImplementedError

    @abstractmethod
    def cursor(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def terms(self):
        raise NotImplementedError

    @abstractmethod
    def terms_from(self, fieldname, prefix):
        raise NotImplementedError

    @abstractmethod
    def items(self):
        raise NotImplementedError

    @abstractmethod
    def items_from(self, fieldname, prefix):
        raise NotImplementedError

    @abstractmethod
    def term_info(self, fieldname, text):
        raise NotImplementedError

    @abstractmethod
    def frequency(self, fieldname, text):
        # Default implementation in terms of term_info(); subclasses may
        # override with something more efficient
        return self.term_info(fieldname, text).weight()

    @abstractmethod
    def doc_frequency(self, fieldname, text):
        # Default implementation in terms of term_info(); subclasses may
        # override with something more efficient
        return self.term_info(fieldname, text).doc_frequency()

    @abstractmethod
    def matcher(self, fieldname, text, format_, scorer=None):
        raise NotImplementedError

    @abstractmethod
    def indexed_field_names(self):
        raise NotImplementedError

    def close(self):
        pass


class Automata(object):
    """Helper for fuzzy term matching using Levenshtein automata."""

    @staticmethod
    def levenshtein_dfa(uterm, maxdist, prefix=0):
        return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa()

    @staticmethod
    def find_matches(dfa, cur):
        """Yields terms from the cursor that are accepted by the DFA, by
        skipping the cursor forward to each next valid string.
        """

        unull = unichr(0)

        term = cur.text()
        if term is None:
            return

        match = dfa.next_valid_string(term)
        while match:
            cur.find(match)
            term = cur.text()
            if term is None:
                return

            if match == term:
                yield match
                # Append NUL to seek strictly past the current term
                term += unull

            match = dfa.next_valid_string(term)

    def terms_within(self, fieldcur, uterm, maxdist, prefix=0):
        dfa = self.levenshtein_dfa(uterm, maxdist, prefix)
        return self.find_matches(dfa, fieldcur)


# Per-doc value reader

class PerDocumentReader(object):
    """Abstract interface for reading per-document values (stored fields,
    lengths, vectors, and columns) from a segment.
    """

    def close(self):
        pass

    @abstractmethod
    def doc_count(self):
        raise NotImplementedError

    @abstractmethod
    def doc_count_all(self):
        raise NotImplementedError

    # Deletions

    @abstractmethod
    def has_deletions(self):
        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    def all_doc_ids(self):
        """
        Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))

    def iter_docs(self):
        for docnum in self.all_doc_ids():
            yield docnum, self.stored_fields(docnum)

    # Columns

    def supports_columns(self):
        return False

    def has_column(self, fieldname):
        return False

    def list_columns(self):
        raise NotImplementedError

    # Don't need to override this if supports_columns() returns False
    def column_reader(self, fieldname, column):
        raise NotImplementedError

    # Bitmaps

    def field_docs(self, fieldname):
        return None

    # Lengths

    @abstractmethod
    def doc_field_length(self, docnum, fieldname, default=0):
        raise NotImplementedError

    @abstractmethod
    def field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def min_field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def max_field_length(self, fieldname):
        raise NotImplementedError

    # Vectors

    def has_vector(self, docnum, fieldname):
        return False

    # Don't need to override this if has_vector() always returns False
    def vector(self, docnum, fieldname, format_):
        raise NotImplementedError

    # Stored

    @abstractmethod
    def stored_fields(self, docnum):
        raise NotImplementedError

    def all_stored_fields(self):
        for docnum in self.all_doc_ids():
            yield self.stored_fields(docnum)


# Segment base class

class Segment(object):
    """Do not instantiate this object directly. It is used by the Index object
    to hold information about a segment. A list of objects of this class are
    pickled as part of the TOC file.

    The TOC file stores a minimal amount of information -- mostly a list of
    Segment objects. Segments are the real reverse indexes. Having multiple
    segments allows quick incremental indexing: just create a new segment for
    the new documents, and have the index overlay the new segment over previous
    ones for purposes of reading/search. "Optimizing" the index combines the
    contents of existing segments into one (removing any deleted documents
    along the way).
    """

    # Extension for compound segment files
    COMPOUND_EXT = ".seg"

    # self.indexname
    # self.segid

    def __init__(self, indexname):
        self.indexname = indexname
        self.segid = self._random_id()
        self.compound = False

    @classmethod
    def _random_id(cls, size=16):
        return random_name(size=size)

    def __repr__(self):
        return "<%s %s>" % (self.__class__.__name__, self.segment_id())

    def codec(self):
        raise NotImplementedError

    def index_name(self):
        return self.indexname

    def segment_id(self):
        if hasattr(self, "name"):
            # Old segment class
            return self.name
        else:
            return "%s_%s" % (self.index_name(), self.segid)

    def is_compound(self):
        # Segments unpickled from old indexes may lack the attribute
        if not hasattr(self, "compound"):
            return False
        return self.compound

    # File convenience methods

    def make_filename(self, ext):
        return "%s%s" % (self.segment_id(), ext)

    def list_files(self, storage):
        prefix = "%s." % self.segment_id()
        return [name for name in storage.list() if name.startswith(prefix)]

    def create_file(self, storage, ext, **kwargs):
        """Convenience method to create a new file in the given storage named
        with this segment's ID and the given extension. Any keyword arguments
        are passed to the storage's create_file method.
        """

        fname = self.make_filename(ext)
        return storage.create_file(fname, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        """Convenience method to open a file in the given storage named with
        this segment's ID and the given extension. Any keyword arguments are
        passed to the storage's open_file method.
        """

        fname = self.make_filename(ext)
        return storage.open_file(fname, **kwargs)

    def create_compound_file(self, storage):
        """Assembles this segment's individual files into a single compound
        file and deletes the originals.
        """

        segfiles = self.list_files(storage)
        assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles)
        cfile = self.create_file(storage, self.COMPOUND_EXT)
        CompoundStorage.assemble(cfile, storage, segfiles)
        for name in segfiles:
            storage.delete_file(name)
        self.compound = True

    def open_compound_file(self, storage):
        name = self.make_filename(self.COMPOUND_EXT)
        dbfile = storage.open_file(name)
        return CompoundStorage(dbfile, use_mmap=storage.supports_mmap)

    # Abstract methods

    @abstractmethod
    def doc_count_all(self):
        """
        Returns the total number of documents, DELETED OR UNDELETED, in this
        segment.
        """

        raise NotImplementedError

    def doc_count(self):
        """
        Returns the number of (undeleted) documents in this segment.
        """

        return self.doc_count_all() - self.deleted_count()

    def set_doc_count(self, doccount):
        raise NotImplementedError

    def has_deletions(self):
        """
        Returns True if any documents in this segment are deleted.
        """

        return self.deleted_count() > 0

    @abstractmethod
    def deleted_count(self):
        """
        Returns the total number of deleted documents in this segment.
        """

        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    @abstractmethod
    def delete_document(self, docnum, delete=True):
        """Deletes the given document number. The document is not actually
        removed from the index until it is optimized.

        :param docnum: The document number to delete.
        :param delete: If False, this undeletes a deleted document.
        """

        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        """
        Returns True if the given document number is deleted.
        """

        raise NotImplementedError

    def should_assemble(self):
        return True


# Wrapping Segment

class WrappingSegment(Segment):
    """A segment that delegates every operation to a wrapped child segment."""

    def __init__(self, child):
        self._child = child

    def codec(self):
        return self._child.codec()

    def index_name(self):
        return self._child.index_name()

    def segment_id(self):
        return self._child.segment_id()

    def is_compound(self):
        return self._child.is_compound()

    def should_assemble(self):
        return self._child.should_assemble()

    def make_filename(self, ext):
        return self._child.make_filename(ext)

    def list_files(self, storage):
        return self._child.list_files(storage)

    def create_file(self, storage, ext, **kwargs):
        return self._child.create_file(storage, ext, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        return self._child.open_file(storage, ext, **kwargs)

    def create_compound_file(self, storage):
        return self._child.create_compound_file(storage)

    def open_compound_file(self, storage):
        return self._child.open_compound_file(storage)

    def delete_document(self, docnum, delete=True):
        return self._child.delete_document(docnum, delete=delete)

    def has_deletions(self):
        return self._child.has_deletions()

    def deleted_count(self):
        return self._child.deleted_count()

    def deleted_docs(self):
        return self._child.deleted_docs()

    def is_deleted(self, docnum):
        return self._child.is_deleted(docnum)

    def set_doc_count(self, doccount):
        self._child.set_doc_count(doccount)

    def doc_count(self):
        return self._child.doc_count()

    def doc_count_all(self):
        return self._child.doc_count_all()


# Multi per doc reader

class MultiPerDocumentReader(PerDocumentReader):
    """Presents a sequence of per-document readers as a single reader with
    contiguous document numbers.
    """

    def __init__(self, readers, offset=0):
        self._readers = readers

        # _doc_offsets[i] is the global docnum of the first document in
        # readers[i]
        self._doc_offsets = []
        self._doccount = 0
        for pdr in readers:
            self._doc_offsets.append(self._doccount)
            self._doccount += pdr.doc_count_all()

        self.is_closed = False

    def close(self):
        for r in self._readers:
            r.close()
        self.is_closed = True

    def doc_count_all(self):
        return self._doccount

    def doc_count(self):
        total = 0
        for r in self._readers:
            total += r.doc_count()
        return total

    def _document_reader(self, docnum):
        # Find the index of the reader containing the given global docnum
        return max(0, bisect_right(self._doc_offsets, docnum) - 1)

    def _reader_and_docnum(self, docnum):
        # Translate a global docnum into (reader index, local docnum)
        rnum = self._document_reader(docnum)
        offset = self._doc_offsets[rnum]
        return rnum, docnum - offset

    # Deletions

    def has_deletions(self):
        return any(r.has_deletions() for r in self._readers)

    def is_deleted(self, docnum):
        x, y = self._reader_and_docnum(docnum)
        return self._readers[x].is_deleted(y)

    def deleted_docs(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.deleted_docs():
                yield docnum + offset

    def all_doc_ids(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.all_doc_ids():
                yield docnum + offset

    # Columns

    def has_column(self, fieldname):
        return any(r.has_column(fieldname) for r in self._readers)

    def column_reader(self, fieldname, column):
        if not self.has_column(fieldname):
            raise ValueError("No column %r" % (fieldname,))

        default = column.default_value()
        colreaders = []
        for r in self._readers:
            if r.has_column(fieldname):
                cr = r.column_reader(fieldname, column)
            else:
                # Substitute a default-valued column for readers missing it
                cr = columns.EmptyColumnReader(default, r.doc_count_all())
            colreaders.append(cr)

        if len(colreaders) == 1:
            return colreaders[0]
        else:
            return columns.MultiColumnReader(colreaders)

    # Lengths

    def doc_field_length(self, docnum, fieldname, default=0):
        x, y = self._reader_and_docnum(docnum)
        return self._readers[x].doc_field_length(y, fieldname, default)

    def field_length(self, fieldname):
        total = 0
        for r in self._readers:
            total += r.field_length(fieldname)
        return total

    # BUGFIX: these previously took no fieldname argument and called the
    # sub-readers without one, contradicting the abstract interface
    # PerDocumentReader.min_field_length(fieldname) and failing at runtime
    def min_field_length(self, fieldname):
        return min(r.min_field_length(fieldname) for r in self._readers)

    def max_field_length(self, fieldname):
        return max(r.max_field_length(fieldname) for r in self._readers)


# Extended base classes

class PerDocWriterWithColumns(PerDocumentWriter):
    """A PerDocumentWriter that implements add_column_value() in terms of
    abstract column-management primitives.
    """

    def __init__(self):
        PerDocumentWriter.__init__(self)
        # Implementations need to set these attributes
        self._storage = None
        self._segment = None
        self._docnum = None

    @abstractmethod
    def _has_column(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def _create_column(self, fieldname, column):
        raise NotImplementedError

    @abstractmethod
    def _get_column(self, fieldname):
        raise NotImplementedError

    def add_column_value(self, fieldname, column, value):
        # Lazily create the column the first time a value is added for it
        if not self._has_column(fieldname):
            self._create_column(fieldname, column)
        self._get_column(fieldname).add(self._docnum, value)


# FieldCursor implementations

class EmptyCursor(FieldCursor):
    """A do-nothing cursor representing a field with no terms."""

    def first(self):
        return None

    def find(self, term):
        return None

    def next(self):
        return None

    def text(self):
        return None

    def term_info(self):
        return None

    def is_valid(self):
        return False