# Copyright 2011 Matt Chaput. All rights reserved.
#
# Redistribution and use in source and binary forms, with or without
# modification, are permitted provided that the following conditions are met:
#
#    1. Redistributions of source code must retain the above copyright notice,
#       this list of conditions and the following disclaimer.
#
#    2. Redistributions in binary form must reproduce the above copyright
#       notice, this list of conditions and the following disclaimer in the
#       documentation and/or other materials provided with the distribution.
#
# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
#
# The views and conclusions contained in the software and documentation are
# those of the authors and should not be interpreted as representing official
# policies, either expressed or implied, of Matt Chaput.

"""
This module contains base classes/interfaces for "codec" objects.

A codec bundles together the writer and reader implementations that define
the on-disk format of a segment: per-document values (stored fields, lengths,
vectors, columns), the inverted index of terms, and postings.
"""

from bisect import bisect_right

from whoosh import columns
from whoosh.automata import lev
from whoosh.compat import abstractmethod, izip, unichr, xrange
from whoosh.filedb.compound import CompoundStorage
from whoosh.system import emptybytes
from whoosh.util import random_name


# Exceptions

class OutOfOrderError(Exception):
    """Raised when postings are fed to a FieldWriter out of sorted
    (fieldname, termbytes) order.
    """

    pass


# Base classes

class Codec(object):
    """Abstract interface for an index segment format.

    Concrete codecs supply factories for the writer objects used while
    indexing and the reader objects used while searching.
    """

    # Whether this codec records per-document field lengths
    length_stats = True

    # Per document value writer

    @abstractmethod
    def per_document_writer(self, storage, segment):
        raise NotImplementedError

    # Inverted index writer

    @abstractmethod
    def field_writer(self, storage, segment):
        raise NotImplementedError

    # Postings

    @abstractmethod
    def postings_writer(self, dbfile, byteids=False):
        raise NotImplementedError

    @abstractmethod
    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        raise NotImplementedError

    # Index readers

    def automata(self, storage, segment):
        # Default implementation; codecs with custom fuzzy-matching support
        # can override this
        return Automata()

    @abstractmethod
    def terms_reader(self, storage, segment):
        raise NotImplementedError

    @abstractmethod
    def per_document_reader(self, storage, segment):
        raise NotImplementedError

    # Segments and generations

    @abstractmethod
    def new_segment(self, storage, indexname):
        raise NotImplementedError


class WrappingCodec(Codec):
    """A codec that delegates every operation to a wrapped child codec.

    Subclass this to intercept/modify individual codec operations while
    inheriting the child's behavior for everything else.
    """

    def __init__(self, child):
        """
        :param child: the codec object to delegate to.
        """

        self._child = child

    def per_document_writer(self, storage, segment):
        return self._child.per_document_writer(storage, segment)

    def field_writer(self, storage, segment):
        return self._child.field_writer(storage, segment)

    def postings_writer(self, dbfile, byteids=False):
        return self._child.postings_writer(dbfile, byteids=byteids)

    def postings_reader(self, dbfile, terminfo, format_, term=None, scorer=None):
        return self._child.postings_reader(dbfile, terminfo, format_, term=term,
                                           scorer=scorer)

    def automata(self, storage, segment):
        return self._child.automata(storage, segment)

    def terms_reader(self, storage, segment):
        return self._child.terms_reader(storage, segment)

    def per_document_reader(self, storage, segment):
        return self._child.per_document_reader(storage, segment)

    def new_segment(self, storage, indexname):
        return self._child.new_segment(storage, indexname)


# Writer classes

class PerDocumentWriter(object):
    """Abstract interface for writing per-document values (stored fields,
    field lengths, vectors, and columns) to a segment.
    """

    @abstractmethod
    def start_doc(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def add_field(self, fieldname, fieldobj, value, length):
        raise NotImplementedError

    @abstractmethod
    def add_column_value(self, fieldname, columnobj, value):
        raise NotImplementedError("Codec does not implement writing columns")

    @abstractmethod
    def add_vector_items(self, fieldname, fieldobj, items):
        raise NotImplementedError

    def add_vector_matcher(self, fieldname, fieldobj, vmatcher):
        """Convenience method: reads (text, weight, valuestring) items off a
        matcher and forwards them to add_vector_items().
        """

        def readitems():
            while vmatcher.is_active():
                text = vmatcher.id()
                weight = vmatcher.weight()
                valuestring = vmatcher.value()
                yield (text, weight, valuestring)
                vmatcher.next()
        self.add_vector_items(fieldname, fieldobj, readitems())

    def finish_doc(self):
        pass

    def close(self):
        pass


class FieldWriter(object):
    """Abstract interface for writing the inverted index (terms and postings)
    of a segment.
    """

    def add_postings(self, schema, lengths, items):
        """Translates a generator of (fieldname, btext, docnum, weight, value)
        postings into calls to start_field(), start_term(), add(),
        finish_term(), finish_field(), etc.

        :param schema: the schema, used to look up field objects by name.
        :param lengths: an object with a doc_field_length() method, or None
            (in which case all lengths are reported as 0).
        :param items: an iterable of (fieldname, btext, docnum, weight, value)
            tuples, sorted by (fieldname, btext, docnum).
        :raises OutOfOrderError: if the items are not in sorted order.
        """

        # Bind methods to locals for speed in the loop
        start_field = self.start_field
        start_term = self.start_term
        add = self.add
        finish_term = self.finish_term
        finish_field = self.finish_field

        if lengths:
            dfl = lengths.doc_field_length
        else:
            dfl = lambda docnum, fieldname: 0

        # The fieldname of the previous posting
        lastfn = None
        # The bytes text of the previous posting
        lasttext = None
        # The (fieldname, btext) of the previous spelling posting
        lastspell = None
        # The field object for the current field
        fieldobj = None
        for fieldname, btext, docnum, weight, value in items:
            # Check for out-of-order postings. This is convoluted because
            # Python 3 removed the ability to compare a string to None
            if lastfn is not None and fieldname < lastfn:
                raise OutOfOrderError("Field %r .. %r" % (lastfn, fieldname))
            if fieldname == lastfn and lasttext and btext < lasttext:
                raise OutOfOrderError("Term %s:%r .. %s:%r"
                                      % (lastfn, lasttext, fieldname, btext))

            # If the fieldname of this posting is different from the last one,
            # tell the writer we're starting a new field
            if fieldname != lastfn:
                if lasttext is not None:
                    finish_term()
                # (fieldname != lastfn is already known true in this branch)
                if lastfn is not None:
                    finish_field()
                fieldobj = schema[fieldname]
                start_field(fieldname, fieldobj)
                lastfn = fieldname
                lasttext = None

            # HACK: items where docnum == -1 indicate words that should be
            # added to the spelling graph, not the postings
            if docnum == -1:
                # spellterm = (fieldname, btext)
                # # There can be duplicates of spelling terms, so only add a
                # # spell term if it's greater than the last one
                # if lastspell is None or spellterm > lastspell:
                #     spellword = fieldobj.from_bytes(btext)
                #     self.add_spell_word(fieldname, spellword)
                #     lastspell = spellterm
                continue

            # If this term is different from the term in the previous posting,
            # tell the writer to start a new term
            if btext != lasttext:
                if lasttext is not None:
                    finish_term()
                start_term(btext)
                lasttext = btext

            # Add this posting
            length = dfl(docnum, fieldname)
            if value is None:
                value = emptybytes
            add(docnum, weight, value, length)

        # Close out any open term/field
        if lasttext is not None:
            finish_term()
        if lastfn is not None:
            finish_field()

    @abstractmethod
    def start_field(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def start_term(self, text):
        raise NotImplementedError

    @abstractmethod
    def add(self, docnum, weight, vbytes, length):
        raise NotImplementedError

    def add_spell_word(self, fieldname, text):
        raise NotImplementedError

    @abstractmethod
    def finish_term(self):
        raise NotImplementedError

    def finish_field(self):
        pass

    def close(self):
        pass


# Postings

class PostingsWriter(object):
    """Abstract interface for writing a single term's postings list."""

    @abstractmethod
    def start_postings(self, format_, terminfo):
        raise NotImplementedError

    @abstractmethod
    def add_posting(self, id_, weight, vbytes, length=None):
        raise NotImplementedError

    def finish_postings(self):
        pass

    @abstractmethod
    def written(self):
        """Returns True if this object has already written to disk.
        """

        raise NotImplementedError


# Reader classes

class FieldCursor(object):
    """Abstract interface for iterating over the terms of a single field."""

    def first(self):
        raise NotImplementedError

    def find(self, string):
        raise NotImplementedError

    def next(self):
        raise NotImplementedError

    def term(self):
        raise NotImplementedError


class TermsReader(object):
    """Abstract interface for reading term information from a segment."""

    @abstractmethod
    def __contains__(self, term):
        raise NotImplementedError

    @abstractmethod
    def cursor(self, fieldname, fieldobj):
        raise NotImplementedError

    @abstractmethod
    def terms(self):
        raise NotImplementedError

    @abstractmethod
    def terms_from(self, fieldname, prefix):
        raise NotImplementedError

    @abstractmethod
    def items(self):
        raise NotImplementedError

    @abstractmethod
    def items_from(self, fieldname, prefix):
        raise NotImplementedError

    @abstractmethod
    def term_info(self, fieldname, text):
        raise NotImplementedError

    @abstractmethod
    def frequency(self, fieldname, text):
        # Default implementation in terms of term_info(); subclasses may
        # override with something more efficient
        return self.term_info(fieldname, text).weight()

    @abstractmethod
    def doc_frequency(self, fieldname, text):
        # Default implementation in terms of term_info(); subclasses may
        # override with something more efficient
        return self.term_info(fieldname, text).doc_frequency()

    @abstractmethod
    def matcher(self, fieldname, text, format_, scorer=None):
        raise NotImplementedError

    @abstractmethod
    def indexed_field_names(self):
        raise NotImplementedError

    def close(self):
        pass


class Automata(object):
    """Helper for fuzzy term matching using Levenshtein automata."""

    @staticmethod
    def levenshtein_dfa(uterm, maxdist, prefix=0):
        return lev.levenshtein_automaton(uterm, maxdist, prefix).to_dfa()

    @staticmethod
    def find_matches(dfa, cur):
        """Yields terms from the cursor that are accepted by the DFA, by
        skipping the cursor forward to each next valid string.
        """

        unull = unichr(0)

        term = cur.text()
        if term is None:
            return

        match = dfa.next_valid_string(term)
        while match:
            cur.find(match)
            term = cur.text()
            if term is None:
                return

            if match == term:
                yield match
                # Append NUL to seek strictly past the current term
                term += unull

            match = dfa.next_valid_string(term)

    def terms_within(self, fieldcur, uterm, maxdist, prefix=0):
        dfa = self.levenshtein_dfa(uterm, maxdist, prefix)
        return self.find_matches(dfa, fieldcur)


# Per-doc value reader

class PerDocumentReader(object):
    """Abstract interface for reading per-document values (stored fields,
    lengths, vectors, and columns) from a segment.
    """

    def close(self):
        pass

    @abstractmethod
    def doc_count(self):
        raise NotImplementedError

    @abstractmethod
    def doc_count_all(self):
        raise NotImplementedError

    # Deletions

    @abstractmethod
    def has_deletions(self):
        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    def all_doc_ids(self):
        """
        Returns an iterator of all (undeleted) document IDs in the reader.
        """

        is_deleted = self.is_deleted
        return (docnum for docnum in xrange(self.doc_count_all())
                if not is_deleted(docnum))

    def iter_docs(self):
        for docnum in self.all_doc_ids():
            yield docnum, self.stored_fields(docnum)

    # Columns

    def supports_columns(self):
        return False

    def has_column(self, fieldname):
        return False

    def list_columns(self):
        raise NotImplementedError

    # Don't need to override this if supports_columns() returns False
    def column_reader(self, fieldname, column):
        raise NotImplementedError

    # Bitmaps

    def field_docs(self, fieldname):
        return None

    # Lengths

    @abstractmethod
    def doc_field_length(self, docnum, fieldname, default=0):
        raise NotImplementedError

    @abstractmethod
    def field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def min_field_length(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def max_field_length(self, fieldname):
        raise NotImplementedError

    # Vectors

    def has_vector(self, docnum, fieldname):
        return False

    # Don't need to override this if has_vector() always returns False
    def vector(self, docnum, fieldname, format_):
        raise NotImplementedError

    # Stored

    @abstractmethod
    def stored_fields(self, docnum):
        raise NotImplementedError

    def all_stored_fields(self):
        for docnum in self.all_doc_ids():
            yield self.stored_fields(docnum)


# Segment base class

class Segment(object):
    """Do not instantiate this object directly. It is used by the Index object
    to hold information about a segment. A list of objects of this class are
    pickled as part of the TOC file.

    The TOC file stores a minimal amount of information -- mostly a list of
    Segment objects. Segments are the real reverse indexes. Having multiple
    segments allows quick incremental indexing: just create a new segment for
    the new documents, and have the index overlay the new segment over previous
    ones for purposes of reading/search. "Optimizing" the index combines the
    contents of existing segments into one (removing any deleted documents
    along the way).
    """

    # Extension for compound segment files
    COMPOUND_EXT = ".seg"

    # self.indexname
    # self.segid

    def __init__(self, indexname):
        self.indexname = indexname
        self.segid = self._random_id()
        self.compound = False

    @classmethod
    def _random_id(cls, size=16):
        return random_name(size=size)

    def __repr__(self):
        return "<%s %s>" % (self.__class__.__name__, self.segment_id())

    def codec(self):
        raise NotImplementedError

    def index_name(self):
        return self.indexname

    def segment_id(self):
        if hasattr(self, "name"):
            # Old segment class
            return self.name
        else:
            return "%s_%s" % (self.index_name(), self.segid)

    def is_compound(self):
        # Segments unpickled from old indexes may lack the attribute
        if not hasattr(self, "compound"):
            return False
        return self.compound

    # File convenience methods

    def make_filename(self, ext):
        return "%s%s" % (self.segment_id(), ext)

    def list_files(self, storage):
        prefix = "%s." % self.segment_id()
        return [name for name in storage.list() if name.startswith(prefix)]

    def create_file(self, storage, ext, **kwargs):
        """Convenience method to create a new file in the given storage named
        with this segment's ID and the given extension. Any keyword arguments
        are passed to the storage's create_file method.
        """

        fname = self.make_filename(ext)
        return storage.create_file(fname, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        """Convenience method to open a file in the given storage named with
        this segment's ID and the given extension. Any keyword arguments are
        passed to the storage's open_file method.
        """

        fname = self.make_filename(ext)
        return storage.open_file(fname, **kwargs)

    def create_compound_file(self, storage):
        """Assembles this segment's individual files into a single compound
        file and deletes the originals.
        """

        segfiles = self.list_files(storage)
        assert not any(name.endswith(self.COMPOUND_EXT) for name in segfiles)
        cfile = self.create_file(storage, self.COMPOUND_EXT)
        CompoundStorage.assemble(cfile, storage, segfiles)
        for name in segfiles:
            storage.delete_file(name)
        self.compound = True

    def open_compound_file(self, storage):
        name = self.make_filename(self.COMPOUND_EXT)
        dbfile = storage.open_file(name)
        return CompoundStorage(dbfile, use_mmap=storage.supports_mmap)

    # Abstract methods

    @abstractmethod
    def doc_count_all(self):
        """
        Returns the total number of documents, DELETED OR UNDELETED, in this
        segment.
        """

        raise NotImplementedError

    def doc_count(self):
        """
        Returns the number of (undeleted) documents in this segment.
        """

        return self.doc_count_all() - self.deleted_count()

    def set_doc_count(self, doccount):
        raise NotImplementedError

    def has_deletions(self):
        """
        Returns True if any documents in this segment are deleted.
        """

        return self.deleted_count() > 0

    @abstractmethod
    def deleted_count(self):
        """
        Returns the total number of deleted documents in this segment.
        """

        raise NotImplementedError

    @abstractmethod
    def deleted_docs(self):
        raise NotImplementedError

    @abstractmethod
    def delete_document(self, docnum, delete=True):
        """Deletes the given document number. The document is not actually
        removed from the index until it is optimized.

        :param docnum: The document number to delete.
        :param delete: If False, this undeletes a deleted document.
        """

        raise NotImplementedError

    @abstractmethod
    def is_deleted(self, docnum):
        """
        Returns True if the given document number is deleted.
        """

        raise NotImplementedError

    def should_assemble(self):
        return True


# Wrapping Segment

class WrappingSegment(Segment):
    """A segment that delegates every operation to a wrapped child segment."""

    def __init__(self, child):
        self._child = child

    def codec(self):
        return self._child.codec()

    def index_name(self):
        return self._child.index_name()

    def segment_id(self):
        return self._child.segment_id()

    def is_compound(self):
        return self._child.is_compound()

    def should_assemble(self):
        return self._child.should_assemble()

    def make_filename(self, ext):
        return self._child.make_filename(ext)

    def list_files(self, storage):
        return self._child.list_files(storage)

    def create_file(self, storage, ext, **kwargs):
        return self._child.create_file(storage, ext, **kwargs)

    def open_file(self, storage, ext, **kwargs):
        return self._child.open_file(storage, ext, **kwargs)

    def create_compound_file(self, storage):
        return self._child.create_compound_file(storage)

    def open_compound_file(self, storage):
        return self._child.open_compound_file(storage)

    def delete_document(self, docnum, delete=True):
        return self._child.delete_document(docnum, delete=delete)

    def has_deletions(self):
        return self._child.has_deletions()

    def deleted_count(self):
        return self._child.deleted_count()

    def deleted_docs(self):
        return self._child.deleted_docs()

    def is_deleted(self, docnum):
        return self._child.is_deleted(docnum)

    def set_doc_count(self, doccount):
        self._child.set_doc_count(doccount)

    def doc_count(self):
        return self._child.doc_count()

    def doc_count_all(self):
        return self._child.doc_count_all()


# Multi per doc reader

class MultiPerDocumentReader(PerDocumentReader):
    """Presents a sequence of per-document readers as a single reader with
    contiguous document numbers.
    """

    def __init__(self, readers, offset=0):
        self._readers = readers

        # _doc_offsets[i] is the global docnum of the first document in
        # readers[i]
        self._doc_offsets = []
        self._doccount = 0
        for pdr in readers:
            self._doc_offsets.append(self._doccount)
            self._doccount += pdr.doc_count_all()

        self.is_closed = False

    def close(self):
        for r in self._readers:
            r.close()
        self.is_closed = True

    def doc_count_all(self):
        return self._doccount

    def doc_count(self):
        total = 0
        for r in self._readers:
            total += r.doc_count()
        return total

    def _document_reader(self, docnum):
        # Find the index of the reader containing the given global docnum
        return max(0, bisect_right(self._doc_offsets, docnum) - 1)

    def _reader_and_docnum(self, docnum):
        # Translate a global docnum into (reader index, local docnum)
        rnum = self._document_reader(docnum)
        offset = self._doc_offsets[rnum]
        return rnum, docnum - offset

    # Deletions

    def has_deletions(self):
        return any(r.has_deletions() for r in self._readers)

    def is_deleted(self, docnum):
        x, y = self._reader_and_docnum(docnum)
        return self._readers[x].is_deleted(y)

    def deleted_docs(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.deleted_docs():
                yield docnum + offset

    def all_doc_ids(self):
        for r, offset in izip(self._readers, self._doc_offsets):
            for docnum in r.all_doc_ids():
                yield docnum + offset

    # Columns

    def has_column(self, fieldname):
        return any(r.has_column(fieldname) for r in self._readers)

    def column_reader(self, fieldname, column):
        if not self.has_column(fieldname):
            raise ValueError("No column %r" % (fieldname,))

        default = column.default_value()
        colreaders = []
        for r in self._readers:
            if r.has_column(fieldname):
                cr = r.column_reader(fieldname, column)
            else:
                # Substitute a default-valued column for readers missing it
                cr = columns.EmptyColumnReader(default, r.doc_count_all())
            colreaders.append(cr)

        if len(colreaders) == 1:
            return colreaders[0]
        else:
            return columns.MultiColumnReader(colreaders)

    # Lengths

    def doc_field_length(self, docnum, fieldname, default=0):
        x, y = self._reader_and_docnum(docnum)
        return self._readers[x].doc_field_length(y, fieldname, default)

    def field_length(self, fieldname):
        total = 0
        for r in self._readers:
            total += r.field_length(fieldname)
        return total

    # BUGFIX: these previously took no fieldname argument and called the
    # sub-readers without one, contradicting the abstract interface
    # PerDocumentReader.min_field_length(fieldname) and failing at runtime
    def min_field_length(self, fieldname):
        return min(r.min_field_length(fieldname) for r in self._readers)

    def max_field_length(self, fieldname):
        return max(r.max_field_length(fieldname) for r in self._readers)


# Extended base classes

class PerDocWriterWithColumns(PerDocumentWriter):
    """A PerDocumentWriter that implements add_column_value() in terms of
    abstract column-management primitives.
    """

    def __init__(self):
        PerDocumentWriter.__init__(self)
        # Implementations need to set these attributes
        self._storage = None
        self._segment = None
        self._docnum = None

    @abstractmethod
    def _has_column(self, fieldname):
        raise NotImplementedError

    @abstractmethod
    def _create_column(self, fieldname, column):
        raise NotImplementedError

    @abstractmethod
    def _get_column(self, fieldname):
        raise NotImplementedError

    def add_column_value(self, fieldname, column, value):
        # Lazily create the column the first time a value is added for it
        if not self._has_column(fieldname):
            self._create_column(fieldname, column)
        self._get_column(fieldname).add(self._docnum, value)


# FieldCursor implementations

class EmptyCursor(FieldCursor):
    """A do-nothing cursor representing a field with no terms."""

    def first(self):
        return None

    def find(self, term):
        return None

    def next(self):
        return None

    def text(self):
        return None

    def term_info(self):
        return None

    def is_valid(self):
        return False