1# Copyright 2007 Matt Chaput. All rights reserved. 2# 3# Redistribution and use in source and binary forms, with or without 4# modification, are permitted provided that the following conditions are met: 5# 6# 1. Redistributions of source code must retain the above copyright notice, 7# this list of conditions and the following disclaimer. 8# 9# 2. Redistributions in binary form must reproduce the above copyright 10# notice, this list of conditions and the following disclaimer in the 11# documentation and/or other materials provided with the distribution. 12# 13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR 14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF 15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO 16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, 17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT 18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, 19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF 20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING 21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, 22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 23# 24# The views and conclusions contained in the software and documentation are 25# those of the authors and should not be interpreted as representing official 26# policies, either expressed or implied, of Matt Chaput. 27 28""" 29 Contains functions and classes related to fields. 30""" 31 32import datetime, fnmatch, re, struct, sys 33from array import array 34from decimal import Decimal 35 36from whoosh import analysis, columns, formats 37from whoosh.compat import with_metaclass 38from whoosh.compat import itervalues, xrange 39from whoosh.compat import bytes_type, string_type, text_type 40from whoosh.system import emptybytes 41from whoosh.system import pack_byte, unpack_byte 42from whoosh.util.numeric import to_sortable, from_sortable 43from whoosh.util.numeric import typecode_max, NaN 44from whoosh.util.text import utf8encode, utf8decode 45from whoosh.util.times import datetime_to_long, long_to_datetime 46 47 48# Exceptions 49 50class FieldConfigurationError(Exception): 51 pass 52 53 54class UnknownFieldError(Exception): 55 pass 56 57 58# Field Types 59 60class FieldType(object): 61 """ 62 Represents a field configuration. 63 64 The FieldType object supports the following attributes: 65 66 * format (formats.Format): the storage format for posting blocks. 67 68 * analyzer (analysis.Analyzer): the analyzer to use to turn text into 69 terms. 70 71 * scorable (boolean): whether searches against this field may be scored. 72 This controls whether the index stores per-document field lengths for 73 this field. 74 75 * stored (boolean): whether the content of this field is stored for each 76 document. For example, in addition to indexing the title of a document, 77 you usually want to store the title so it can be presented as part of 78 the search results. 79 80 * unique (boolean): whether this field's value is unique to each document. 81 For example, 'path' or 'ID'. IndexWriter.update_document() will use 82 fields marked as 'unique' to find the previous version of a document 83 being updated. 84 85 * multitoken_query is a string indicating what kind of query to use when 86 a "word" in a user query parses into multiple tokens. The string is 87 interpreted by the query parser. The strings understood by the default 88 query parser are "first" (use first token only), "and" (join the tokens 89 with an AND query), "or" (join the tokens with OR), "phrase" (join 90 the tokens with a phrase query), and "default" (use the query parser's 91 default join type). 92 93 * vector (formats.Format or boolean): the format to use to store term 94 vectors. If not a ``Format`` object, any true value means to use the 95 index format as the term vector format. Any flase value means don't 96 store term vectors for this field. 97 98 The constructor for the base field type simply lets you supply your own 99 attribute values. Subclasses may configure some or all of this for you. 100 """ 101 102 analyzer = format = scorable = stored = unique = vector = None 103 indexed = True 104 multitoken_query = "default" 105 sortable_typecode = None 106 column_type = None 107 108 def __init__(self, format, analyzer, scorable=False, 109 stored=False, unique=False, multitoken_query="default", 110 sortable=False, vector=None): 111 self.format = format 112 self.analyzer = analyzer 113 self.scorable = scorable 114 self.stored = stored 115 self.unique = unique 116 self.multitoken_query = multitoken_query 117 self.set_sortable(sortable) 118 119 if isinstance(vector, formats.Format): 120 self.vector = vector 121 elif vector: 122 self.vector = self.format 123 else: 124 self.vector = None 125 126 def __repr__(self): 127 return ("%s(format=%r, scorable=%s, stored=%s, unique=%s)" 128 % (self.__class__.__name__, self.format, self.scorable, 129 self.stored, self.unique)) 130 131 def __eq__(self, other): 132 return all((isinstance(other, FieldType), 133 (self.format == other.format), 134 (self.scorable == other.scorable), 135 (self.stored == other.stored), 136 (self.unique == other.unique), 137 (self.column_type == other.column_type))) 138 139 def __ne__(self, other): 140 return not(self.__eq__(other)) 141 142 # Text 143 144 def index(self, value, **kwargs): 145 """Returns an iterator of (btext, frequency, weight, encoded_value) 146 tuples for each unique word in the input value. 147 148 The default implementation uses the ``analyzer`` attribute to tokenize 149 the value into strings, then encodes them into bytes using UTF-8. 150 """ 151 152 if not self.format: 153 raise Exception("%s field %r cannot index without a format" 154 % (self.__class__.__name__, self)) 155 if not isinstance(value, (text_type, list, tuple)): 156 raise ValueError("%r is not unicode or sequence" % value) 157 assert isinstance(self.format, formats.Format) 158 159 if "mode" not in kwargs: 160 kwargs["mode"] = "index" 161 162 word_values = self.format.word_values 163 ana = self.analyzer 164 for tstring, freq, wt, vbytes in word_values(value, ana, **kwargs): 165 yield (utf8encode(tstring)[0], freq, wt, vbytes) 166 167 def tokenize(self, value, **kwargs): 168 """ 169 Analyzes the given string and returns an iterator of Token objects 170 (note: for performance reasons, actually the same token yielded over 171 and over with different attributes). 172 """ 173 174 if not self.analyzer: 175 raise Exception("%s field has no analyzer" % self.__class__) 176 return self.analyzer(value, **kwargs) 177 178 def process_text(self, qstring, mode='', **kwargs): 179 """ 180 Analyzes the given string and returns an iterator of token texts. 181 182 >>> field = fields.TEXT() 183 >>> list(field.process_text("The ides of March")) 184 ["ides", "march"] 185 """ 186 187 if not self.format: 188 raise Exception("%s field has no format" % self) 189 return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs)) 190 191 # Conversion 192 193 def to_bytes(self, value): 194 """ 195 Returns a bytes representation of the given value, appropriate to be 196 written to disk. The default implementation assumes a unicode value and 197 encodes it using UTF-8. 198 """ 199 200 if isinstance(value, (list, tuple)): 201 value = value[0] 202 if not isinstance(value, bytes_type): 203 value = utf8encode(value)[0] 204 return value 205 206 def to_column_value(self, value): 207 """ 208 Returns an object suitable to be inserted into the document values 209 column for this field. The default implementation simply calls 210 ``self.to_bytes(value)``. 211 """ 212 213 return self.to_bytes(value) 214 215 def from_bytes(self, bs): 216 return utf8decode(bs)[0] 217 218 def from_column_value(self, value): 219 return self.from_bytes(value) 220 221 # Columns/sorting 222 223 def set_sortable(self, sortable): 224 if sortable: 225 if isinstance(sortable, columns.Column): 226 self.column_type = sortable 227 else: 228 self.column_type = self.default_column() 229 else: 230 self.column_type = None 231 232 def sortable_terms(self, ixreader, fieldname): 233 """ 234 Returns an iterator of the "sortable" tokens in the given reader and 235 field. These values can be used for sorting. The default implementation 236 simply returns all tokens in the field. 237 238 This can be overridden by field types such as NUMERIC where some values 239 in a field are not useful for sorting. 240 """ 241 242 return ixreader.lexicon(fieldname) 243 244 def default_column(self): 245 return columns.VarBytesColumn() 246 247 # Parsing 248 249 def self_parsing(self): 250 """ 251 Subclasses should override this method to return True if they want 252 the query parser to call the field's ``parse_query()`` method instead 253 of running the analyzer on text in this field. This is useful where 254 the field needs full control over how queries are interpreted, such 255 as in the numeric field type. 256 """ 257 258 return False 259 260 def parse_query(self, fieldname, qstring, boost=1.0): 261 """ 262 When ``self_parsing()`` returns True, the query parser will call 263 this method to parse basic query text. 264 """ 265 266 raise NotImplementedError(self.__class__.__name__) 267 268 def parse_range(self, fieldname, start, end, startexcl, endexcl, 269 boost=1.0): 270 """ 271 When ``self_parsing()`` returns True, the query parser will call 272 this method to parse range query text. If this method returns None 273 instead of a query object, the parser will fall back to parsing the 274 start and end terms using process_text(). 275 """ 276 277 return None 278 279 # Spelling 280 281 def separate_spelling(self): 282 """ 283 Returns True if the field stores unstemmed words in a separate field for 284 spelling suggestions. 285 """ 286 287 return False 288 289 def spelling_fieldname(self, fieldname): 290 """ 291 Returns the name of a field to use for spelling suggestions instead of 292 this field. 293 294 :param fieldname: the name of this field. 295 """ 296 297 return fieldname 298 299 def spellable_words(self, value): 300 """Returns an iterator of each unique word (in sorted order) in the 301 input value, suitable for inclusion in the field's word graph. 302 303 The default behavior is to call the field analyzer with the keyword 304 argument ``no_morph=True``, which should make the analyzer skip any 305 morphological transformation filters (e.g. stemming) to preserve the 306 original form of the words. Exotic field types may need to override 307 this behavior. 308 """ 309 310 if isinstance(value, (list, tuple)): 311 words = value 312 else: 313 words = [token.text for token 314 in self.analyzer(value, no_morph=True)] 315 316 return iter(sorted(set(words))) 317 318 # Utility 319 320 def subfields(self): 321 """ 322 Returns an iterator of ``(name_prefix, fieldobject)`` pairs for the 323 fields that need to be indexed when content is put in this field. The 324 default implementation simply yields ``("", self)``. 325 """ 326 327 yield "", self 328 329 def supports(self, name): 330 """ 331 Returns True if the underlying format supports the given posting 332 value type. 333 334 >>> field = TEXT() 335 >>> field.supports("positions") 336 True 337 >>> field.supports("chars") 338 False 339 """ 340 341 return self.format.supports(name) 342 343 def clean(self): 344 """ 345 Clears any cached information in the field and any child objects. 346 """ 347 348 if self.format and hasattr(self.format, "clean"): 349 self.format.clean() 350 351 # Events 352 353 def on_add(self, schema, fieldname): 354 pass 355 356 def on_remove(self, schema, fieldname): 357 pass 358 359 360# Wrapper base class 361 362class FieldWrapper(FieldType): 363 def __init__(self, subfield, prefix): 364 if isinstance(subfield, type): 365 subfield = subfield() 366 self.subfield = subfield 367 self.name_prefix = prefix 368 369 # By default we'll copy all the subfield's attributes -- override these 370 # in subclass constructor for things you want to change 371 self.analyzer = subfield.analyzer 372 self.format = subfield.format 373 self.column_type = subfield.column_type 374 self.scorable = subfield.scorable 375 self.stored = subfield.stored 376 self.unique = subfield.unique 377 self.indexed = subfield.indexed 378 self.vector = subfield.vector 379 380 def __eq__(self, other): 381 return self.subfield.__eq__(other) 382 383 def __ne__(self, other): 384 return self.subfield.__ne__(other) 385 386 # Text 387 388 # def index(self, value, boost=1.0, **kwargs): 389 # return self.subfield.index(value, boost, **kwargs) 390 # 391 # def tokenize(self, value, **kwargs): 392 # return self.subfield.tokenize(value, **kwargs) 393 # 394 # def process_text(self, qstring, mode='', **kwargs): 395 # return self.subfield.process_text(qstring, mode, **kwargs) 396 397 # Conversion 398 399 def to_bytes(self, value): 400 return self.subfield.to_bytes(value) 401 402 def to_column_value(self, value): 403 return self.subfield.to_column_value(value) 404 405 def from_bytes(self, bs): 406 return self.subfield.from_bytes(bs) 407 408 def from_column_value(self, value): 409 return self.subfield.from_column_value(value) 410 411 # Sorting/columns 412 413 def set_sortable(self, sortable): 414 self.subfield.set_sortable(sortable) 415 416 def sortable_terms(self, ixreader, fieldname): 417 return self.subfield.sortable_terms(ixreader, fieldname) 418 419 def default_column(self): 420 return self.subfield.default_column() 421 422 # Parsing 423 424 def self_parsing(self): 425 return self.subfield.self_parsing() 426 427 def parse_query(self, fieldname, qstring, boost=1.0): 428 return self.subfield.parse_query(fieldname, qstring, boost) 429 430 def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0): 431 self.subfield.parse_range(fieldname, start, end, startexcl, endexcl, 432 boost) 433 434 # Utility 435 436 def subfields(self): 437 # The default FieldWrapper.subfields() implementation DOES NOT split 438 # out the subfield here -- you need to override if that's what you want 439 yield "", self 440 441 def supports(self, name): 442 return self.subfield.supports(name) 443 444 def clean(self): 445 self.subfield.clean() 446 447 # Events 448 449 def on_add(self, schema, fieldname): 450 self.subfield.on_add(schema, fieldname) 451 452 def on_remove(self, schema, fieldname): 453 self.subfield.on_remove(schema, fieldname) 454 455 456# Pre-configured field types 457 458class ID(FieldType): 459 """ 460 Configured field type that indexes the entire value of the field as one 461 token. This is useful for data you don't want to tokenize, such as the path 462 of a file. 463 """ 464 465 def __init__(self, stored=False, unique=False, field_boost=1.0, 466 sortable=False, analyzer=None): 467 """ 468 :param stored: Whether the value of this field is stored with the 469 document. 470 """ 471 472 self.analyzer = analyzer or analysis.IDAnalyzer() 473 # Don't store any information other than the doc ID 474 self.format = formats.Existence(field_boost=field_boost) 475 self.stored = stored 476 self.unique = unique 477 self.set_sortable(sortable) 478 479 480class IDLIST(FieldType): 481 """ 482 Configured field type for fields containing IDs separated by whitespace 483 and/or punctuation (or anything else, using the expression param). 484 """ 485 486 def __init__(self, stored=False, unique=False, expression=None, 487 field_boost=1.0): 488 """ 489 :param stored: Whether the value of this field is stored with the 490 document. 491 :param unique: Whether the value of this field is unique per-document. 492 :param expression: The regular expression object to use to extract 493 tokens. The default expression breaks tokens on CRs, LFs, tabs, 494 spaces, commas, and semicolons. 495 """ 496 497 expression = expression or re.compile(r"[^\r\n\t ,;]+") 498 self.analyzer = analysis.RegexAnalyzer(expression=expression) 499 # Don't store any information other than the doc ID 500 self.format = formats.Existence(field_boost=field_boost) 501 self.stored = stored 502 self.unique = unique 503 504 505class NUMERIC(FieldType): 506 """ 507 Special field type that lets you index integer or floating point 508 numbers in relatively short fixed-width terms. The field converts numbers 509 to sortable bytes for you before indexing. 510 511 You specify the numeric type of the field (``int`` or ``float``) when you 512 create the ``NUMERIC`` object. The default is ``int``. For ``int``, you can 513 specify a size in bits (``32`` or ``64``). For both ``int`` and ``float`` 514 you can specify a ``signed`` keyword argument (default is ``True``). 515 516 >>> schema = Schema(path=STORED, position=NUMERIC(int, 64, signed=False)) 517 >>> ix = storage.create_index(schema) 518 >>> with ix.writer() as w: 519 ... w.add_document(path="/a", position=5820402204) 520 ... 521 522 You can also use the NUMERIC field to store Decimal instances by specifying 523 a type of ``int`` or ``long`` and the ``decimal_places`` keyword argument. 524 This simply multiplies each number by ``(10 ** decimal_places)`` before 525 storing it as an integer. Of course this may throw away decimal prcesision 526 (by truncating, not rounding) and imposes the same maximum value limits as 527 ``int``/``long``, but these may be acceptable for certain applications. 528 529 >>> from decimal import Decimal 530 >>> schema = Schema(path=STORED, position=NUMERIC(int, decimal_places=4)) 531 >>> ix = storage.create_index(schema) 532 >>> with ix.writer() as w: 533 ... w.add_document(path="/a", position=Decimal("123.45") 534 ... 535 536 """ 537 538 def __init__(self, numtype=int, bits=32, stored=False, unique=False, 539 field_boost=1.0, decimal_places=0, shift_step=4, signed=True, 540 sortable=False, default=None): 541 """ 542 :param numtype: the type of numbers that can be stored in this field, 543 either ``int``, ``float``. If you use ``Decimal``, 544 use the ``decimal_places`` argument to control how many decimal 545 places the field will store. 546 :param bits: When ``numtype`` is ``int``, the number of bits to use to 547 store the number: 8, 16, 32, or 64. 548 :param stored: Whether the value of this field is stored with the 549 document. 550 :param unique: Whether the value of this field is unique per-document. 551 :param decimal_places: specifies the number of decimal places to save 552 when storing Decimal instances. If you set this, you will always 553 get Decimal instances back from the field. 554 :param shift_steps: The number of bits of precision to shift away at 555 each tiered indexing level. Values should generally be 1-8. Lower 556 values yield faster searches but take up more space. A value 557 of `0` means no tiered indexing. 558 :param signed: Whether the numbers stored in this field may be 559 negative. 560 """ 561 562 # Allow users to specify strings instead of Python types in case 563 # docstring isn't clear 564 if numtype == "int": 565 numtype = int 566 if numtype == "float": 567 numtype = float 568 # Raise an error if the user tries to use a type other than int or 569 # float 570 if numtype is Decimal: 571 numtype = int 572 if not decimal_places: 573 raise TypeError("To store Decimal instances, you must set the " 574 "decimal_places argument") 575 elif numtype not in (int, float): 576 raise TypeError("Can't use %r as a type, use int or float" 577 % numtype) 578 # Sanity check 579 if numtype is float and decimal_places: 580 raise Exception("A float type and decimal_places argument %r are " 581 "incompatible" % decimal_places) 582 583 intsizes = [8, 16, 32, 64] 584 intcodes = ["B", "H", "I", "Q"] 585 # Set up field configuration based on type and size 586 if numtype is float: 587 bits = 64 # Floats are converted to 64 bit ints 588 else: 589 if bits not in intsizes: 590 raise Exception("Invalid bits %r, use 8, 16, 32, or 64" 591 % bits) 592 # Type code for the *sortable* representation 593 self.sortable_typecode = intcodes[intsizes.index(bits)] 594 self._struct = struct.Struct(">" + str(self.sortable_typecode)) 595 596 self.numtype = numtype 597 self.bits = bits 598 self.stored = stored 599 self.unique = unique 600 self.decimal_places = decimal_places 601 self.shift_step = shift_step 602 self.signed = signed 603 self.analyzer = analysis.IDAnalyzer() 604 # Don't store any information other than the doc ID 605 self.format = formats.Existence(field_boost=field_boost) 606 self.min_value, self.max_value = self._min_max() 607 608 # Column configuration 609 if default is None: 610 if numtype is int: 611 default = typecode_max[self.sortable_typecode] 612 else: 613 default = NaN 614 elif not self.is_valid(default): 615 raise Exception("The default %r is not a valid number for this " 616 "field" % default) 617 618 self.default = default 619 self.set_sortable(sortable) 620 621 def __getstate__(self): 622 d = self.__dict__.copy() 623 if "_struct" in d: 624 del d["_struct"] 625 return d 626 627 def __setstate__(self, d): 628 self.__dict__.update(d) 629 self._struct = struct.Struct(">" + str(self.sortable_typecode)) 630 if "min_value" not in d: 631 d["min_value"], d["max_value"] = self._min_max() 632 633 def _min_max(self): 634 numtype = self.numtype 635 bits = self.bits 636 signed = self.signed 637 638 # Calculate the minimum and maximum possible values for error checking 639 min_value = from_sortable(numtype, bits, signed, 0) 640 max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1) 641 642 return min_value, max_value 643 644 def default_column(self): 645 return columns.NumericColumn(self.sortable_typecode, 646 default=self.default) 647 648 def is_valid(self, x): 649 try: 650 x = self.to_bytes(x) 651 except ValueError: 652 return False 653 except OverflowError: 654 return False 655 656 return True 657 658 def index(self, num, **kwargs): 659 # If the user gave us a list of numbers, recurse on the list 660 if isinstance(num, (list, tuple)): 661 for n in num: 662 for item in self.index(n): 663 yield item 664 return 665 666 # word, freq, weight, valuestring 667 if self.shift_step: 668 for shift in xrange(0, self.bits, self.shift_step): 669 yield (self.to_bytes(num, shift), 1, 1.0, emptybytes) 670 else: 671 yield (self.to_bytes(num), 1, 1.0, emptybytes) 672 673 def prepare_number(self, x): 674 if x == emptybytes or x is None: 675 return x 676 677 dc = self.decimal_places 678 if dc and isinstance(x, (string_type, Decimal)): 679 x = Decimal(x) * (10 ** dc) 680 elif isinstance(x, Decimal): 681 raise TypeError("Can't index a Decimal object unless you specified " 682 "decimal_places on the field") 683 684 try: 685 x = self.numtype(x) 686 except OverflowError: 687 raise ValueError("Value %r overflowed number type %r" 688 % (x, self.numtype)) 689 690 if x < self.min_value or x > self.max_value: 691 raise ValueError("Numeric field value %s out of range [%s, %s]" 692 % (x, self.min_value, self.max_value)) 693 return x 694 695 def unprepare_number(self, x): 696 dc = self.decimal_places 697 if dc: 698 s = str(x) 699 x = Decimal(s[:-dc] + "." + s[-dc:]) 700 return x 701 702 def to_column_value(self, x): 703 if isinstance(x, (list, tuple, array)): 704 x = x[0] 705 x = self.prepare_number(x) 706 return to_sortable(self.numtype, self.bits, self.signed, x) 707 708 def from_column_value(self, x): 709 x = from_sortable(self.numtype, self.bits, self.signed, x) 710 return self.unprepare_number(x) 711 712 def to_bytes(self, x, shift=0): 713 # Try to avoid re-encoding; this sucks because on Python 2 we can't 714 # tell the difference between a string and encoded bytes, so we have 715 # to require the user use unicode when they mean string 716 if isinstance(x, bytes_type): 717 return x 718 719 if x == emptybytes or x is None: 720 return self.sortable_to_bytes(0) 721 722 x = self.prepare_number(x) 723 x = to_sortable(self.numtype, self.bits, self.signed, x) 724 return self.sortable_to_bytes(x, shift) 725 726 def sortable_to_bytes(self, x, shift=0): 727 if shift: 728 x >>= shift 729 return pack_byte(shift) + self._struct.pack(x) 730 731 def from_bytes(self, bs): 732 x = self._struct.unpack(bs[1:])[0] 733 x = from_sortable(self.numtype, self.bits, self.signed, x) 734 x = self.unprepare_number(x) 735 return x 736 737 def process_text(self, text, **kwargs): 738 return (self.to_bytes(text),) 739 740 def self_parsing(self): 741 return True 742 743 def parse_query(self, fieldname, qstring, boost=1.0): 744 from whoosh import query 745 from whoosh.qparser.common import QueryParserError 746 747 if qstring == "*": 748 return query.Every(fieldname, boost=boost) 749 750 if not self.is_valid(qstring): 751 raise QueryParserError("%r is not a valid number" % qstring) 752 753 token = self.to_bytes(qstring) 754 return query.Term(fieldname, token, boost=boost) 755 756 def parse_range(self, fieldname, start, end, startexcl, endexcl, 757 boost=1.0): 758 from whoosh import query 759 from whoosh.qparser.common import QueryParserError 760 761 if start is not None: 762 if not self.is_valid(start): 763 raise QueryParserError("Range start %r is not a valid number" 764 % start) 765 start = self.prepare_number(start) 766 if end is not None: 767 if not self.is_valid(end): 768 raise QueryParserError("Range end %r is not a valid number" 769 % end) 770 end = self.prepare_number(end) 771 return query.NumericRange(fieldname, start, end, startexcl, endexcl, 772 boost=boost) 773 774 def sortable_terms(self, ixreader, fieldname): 775 zero = b"\x00" 776 for token in ixreader.lexicon(fieldname): 777 if token[0:1] != zero: 778 # Only yield the full-precision values 779 break 780 yield token 781 782 783class DATETIME(NUMERIC): 784 """ 785 Special field type that lets you index datetime objects. The field 786 converts the datetime objects to sortable text for you before indexing. 787 788 Since this field is based on Python's datetime module it shares all the 789 limitations of that module, such as the inability to represent dates before 790 year 1 in the proleptic Gregorian calendar. However, since this field 791 stores datetimes as an integer number of microseconds, it could easily 792 represent a much wider range of dates if the Python datetime implementation 793 ever supports them. 794 795 >>> schema = Schema(path=STORED, date=DATETIME) 796 >>> ix = storage.create_index(schema) 797 >>> w = ix.writer() 798 >>> w.add_document(path="/a", date=datetime.now()) 799 >>> w.commit() 800 """ 801 802 def __init__(self, stored=False, unique=False, sortable=False): 803 """ 804 :param stored: Whether the value of this field is stored with the 805 document. 806 :param unique: Whether the value of this field is unique per-document. 807 """ 808 809 super(DATETIME, self).__init__(int, 64, stored=stored, 810 unique=unique, shift_step=8, 811 sortable=sortable) 812 813 def prepare_datetime(self, x): 814 from whoosh.util.times import floor 815 816 if isinstance(x, text_type): 817 # For indexing, support same strings as for query parsing -- 818 # convert unicode to datetime object 819 x = self._parse_datestring(x) 820 x = floor(x) # this makes most sense (unspecified = lowest) 821 822 if isinstance(x, datetime.datetime): 823 return datetime_to_long(x) 824 elif isinstance(x, bytes_type): 825 return x 826 else: 827 raise Exception("%r is not a datetime" % (x,)) 828 829 def to_column_value(self, x): 830 if isinstance(x, bytes_type): 831 raise Exception("%r is not a datetime" % (x,)) 832 if isinstance(x, (list, tuple)): 833 x = x[0] 834 return self.prepare_datetime(x) 835 836 def from_column_value(self, x): 837 return long_to_datetime(x) 838 839 def to_bytes(self, x, shift=0): 840 x = self.prepare_datetime(x) 841 return NUMERIC.to_bytes(self, x, shift=shift) 842 843 def from_bytes(self, bs): 844 x = NUMERIC.from_bytes(self, bs) 845 return long_to_datetime(x) 846 847 def _parse_datestring(self, qstring): 848 # This method parses a very simple datetime representation of the form 849 # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]] 850 from whoosh.util.times import adatetime, fix, is_void 851 852 qstring = qstring.replace(" ", "").replace("-", "").replace(".", "") 853 year = month = day = hour = minute = second = microsecond = None 854 if len(qstring) >= 4: 855 year = int(qstring[:4]) 856 if len(qstring) >= 6: 857 month = int(qstring[4:6]) 858 if len(qstring) >= 8: 859 day = int(qstring[6:8]) 860 if len(qstring) >= 10: 861 hour = int(qstring[8:10]) 862 if len(qstring) >= 12: 863 minute = int(qstring[10:12]) 864 if len(qstring) >= 14: 865 second = int(qstring[12:14]) 866 if len(qstring) == 20: 867 microsecond = int(qstring[14:]) 868 869 at = fix(adatetime(year, month, day, hour, minute, second, 870 microsecond)) 871 if is_void(at): 872 raise Exception("%r is not a parseable date" % qstring) 873 return at 874 875 def parse_query(self, fieldname, qstring, boost=1.0): 876 from whoosh import query 877 from whoosh.util.times import is_ambiguous 878 879 try: 880 at = self._parse_datestring(qstring) 881 except: 882 e = sys.exc_info()[1] 883 return query.error_query(e) 884 885 if is_ambiguous(at): 886 startnum = datetime_to_long(at.floor()) 887 endnum = datetime_to_long(at.ceil()) 888 return query.NumericRange(fieldname, startnum, endnum) 889 else: 890 return query.Term(fieldname, at, boost=boost) 891 892 def parse_range(self, fieldname, start, end, startexcl, endexcl, 893 boost=1.0): 894 from whoosh import query 895 896 if start is None and end is None: 897 return query.Every(fieldname, boost=boost) 898 899 if start is not None: 900 startdt = self._parse_datestring(start).floor() 901 start = datetime_to_long(startdt) 902 903 if end is not None: 904 enddt = self._parse_datestring(end).ceil() 905 end = datetime_to_long(enddt) 906 907 return query.NumericRange(fieldname, start, end, boost=boost) 908 909 910class BOOLEAN(FieldType): 911 """ 912 Special field type that lets you index boolean values (True and False). 913 The field converts the boolean values to text for you before indexing. 914 915 >>> schema = Schema(path=STORED, done=BOOLEAN) 916 >>> ix = storage.create_index(schema) 917 >>> w = ix.writer() 918 >>> w.add_document(path="/a", done=False) 919 >>> w.commit() 920 """ 921 922 bytestrings = (b"f", b"t") 923 trues = frozenset(u"t true yes 1".split()) 924 falses = frozenset(u"f false no 0".split()) 925 926 def __init__(self, stored=False, field_boost=1.0): 927 """ 928 :param stored: Whether the value of this field is stored with the 929 document. 930 """ 931 932 self.stored = stored 933 # Don't store any information other than the doc ID 934 self.format = formats.Existence(field_boost=field_boost) 935 936 def _obj_to_bool(self, x): 937 # We special case strings such as "true", "false", "yes", "no", but 938 # otherwise call bool() on the query value. This lets you pass objects 939 # as query values and do the right thing. 940 941 if isinstance(x, string_type) and x.lower() in self.trues: 942 x = True 943 elif isinstance(x, string_type) and x.lower() in self.falses: 944 x = False 945 else: 946 x = bool(x) 947 return x 948 949 def to_bytes(self, x): 950 if isinstance(x, bytes_type): 951 return x 952 elif isinstance(x, string_type): 953 x = x.lower() in self.trues 954 else: 955 x = bool(x) 956 bs = self.bytestrings[int(x)] 957 return bs 958 959 def index(self, bit, **kwargs): 960 if isinstance(bit, string_type): 961 bit = bit.lower() in self.trues 962 else: 963 bit = bool(bit) 964 # word, freq, weight, valuestring 965 return [(self.bytestrings[int(bit)], 1, 1.0, emptybytes)] 966 967 def self_parsing(self): 968 return True 969 970 def parse_query(self, fieldname, qstring, boost=1.0): 971 from whoosh import query 972 973 if qstring == "*": 974 return query.Every(fieldname, boost=boost) 975 976 return query.Term(fieldname, self._obj_to_bool(qstring), boost=boost) 977 978 979class STORED(FieldType): 980 """ 981 Configured field type for fields you want to store but not index. 982 """ 983 984 indexed = False 985 stored = True 986 987 def __init__(self): 988 pass 989 990 991class COLUMN(FieldType): 992 """ 993 Configured field type for fields you want to store as a per-document 994 value column but not index. 995 """ 996 997 indexed = False 998 stored = False 999 1000 def __init__(self, columnobj=None): 1001 if columnobj is None: 1002 columnobj = columns.VarBytesColumn() 1003 if not isinstance(columnobj, columns.Column): 1004 raise TypeError("%r is not a column object" % (columnobj,)) 1005 self.column_type = columnobj 1006 1007 def to_bytes(self, v): 1008 return v 1009 1010 def from_bytes(self, b): 1011 return b 1012 1013 1014class KEYWORD(FieldType): 1015 """ 1016 Configured field type for fields containing space-separated or 1017 comma-separated keyword-like data (such as tags). The default is to not 1018 store positional information (so phrase searching is not allowed in this 1019 field) and to not make the field scorable. 1020 """ 1021 1022 def __init__(self, stored=False, lowercase=False, commas=False, 1023 scorable=False, unique=False, field_boost=1.0, sortable=False, 1024 vector=None, analyzer=None): 1025 """ 1026 :param stored: Whether to store the value of the field with the 1027 document. 1028 :param commas: Whether this is a comma-separated field. If this is False 1029 (the default), it is treated as a space-separated field. 1030 :param scorable: Whether this field is scorable. 1031 """ 1032 1033 if not analyzer: 1034 analyzer = analysis.KeywordAnalyzer(lowercase=lowercase, 1035 commas=commas) 1036 self.analyzer = analyzer 1037 1038 # Store field lengths and weights along with doc ID 1039 self.format = formats.Frequency(field_boost=field_boost) 1040 self.scorable = scorable 1041 self.stored = stored 1042 self.unique = unique 1043 1044 if isinstance(vector, formats.Format): 1045 self.vector = vector 1046 elif vector: 1047 self.vector = self.format 1048 else: 1049 self.vector = None 1050 1051 if sortable: 1052 self.column_type = self.default_column() 1053 1054 1055class TEXT(FieldType): 1056 """ 1057 Configured field type for text fields (for example, the body text of an 1058 article). The default is to store positional information to allow phrase 1059 searching. This field type is always scorable. 1060 """ 1061 1062 def __init__(self, analyzer=None, phrase=True, chars=False, stored=False, 1063 field_boost=1.0, multitoken_query="default", spelling=False, 1064 sortable=False, lang=None, vector=None, 1065 spelling_prefix="spell_"): 1066 """ 1067 :param analyzer: The analysis.Analyzer to use to index the field 1068 contents. See the analysis module for more information. If you omit 1069 this argument, the field uses analysis.StandardAnalyzer. 1070 :param phrase: Whether the store positional information to allow phrase 1071 searching. 1072 :param chars: Whether to store character ranges along with positions. 1073 If this is True, "phrase" is also implied. 1074 :param stored: Whether to store the value of this field with the 1075 document. Since this field type generally contains a lot of text, 1076 you should avoid storing it with the document unless you need to, 1077 for example to allow fast excerpts in the search results. 1078 :param spelling: if True, and if the field's analyzer changes the form 1079 of term text (such as a stemming analyzer), this field will store 1080 extra information in a separate field (named using the 1081 ``spelling_prefix`` keyword argument) to allow spelling suggestions 1082 to use the unchanged word forms as spelling suggestions. 1083 :param sortable: If True, make this field sortable using the default 1084 column type. If you pass a :class:`whoosh.columns.Column` instance 1085 instead of True, the field will use the given column type. 1086 :param lang: automaticaly configure a 1087 :class:`whoosh.analysis.LanguageAnalyzer` for the given language. 1088 This is ignored if you also specify an ``analyzer``. 1089 :param vector: if this value evaluates to true, store a list of the 1090 terms in this field in each document. If the value is an instance 1091 of :class:`whoosh.formats.Format`, the index will use the object to 1092 store the term vector. Any other true value (e.g. ``vector=True``) 1093 will use the field's index format to store the term vector as well. 1094 """ 1095 1096 if analyzer: 1097 self.analyzer = analyzer 1098 elif lang: 1099 self.analyzer = analysis.LanguageAnalyzer(lang) 1100 else: 1101 self.analyzer = analysis.StandardAnalyzer() 1102 1103 if chars: 1104 formatclass = formats.Characters 1105 elif phrase: 1106 formatclass = formats.Positions 1107 else: 1108 formatclass = formats.Frequency 1109 self.format = formatclass(field_boost=field_boost) 1110 1111 if sortable: 1112 if isinstance(sortable, columns.Column): 1113 self.column_type = sortable 1114 else: 1115 self.column_type = columns.VarBytesColumn() 1116 else: 1117 self.column_type = None 1118 1119 self.spelling = spelling 1120 self.spelling_prefix = spelling_prefix 1121 self.multitoken_query = multitoken_query 1122 self.scorable = True 1123 self.stored = stored 1124 1125 if isinstance(vector, formats.Format): 1126 self.vector = vector 1127 elif vector: 1128 self.vector = self.format 1129 else: 1130 self.vector = None 1131 1132 def subfields(self): 1133 yield "", self 1134 1135 # If the user indicated this is a spellable field, and the analyzer 1136 # is morphic, then also index into a spelling-only field that stores 1137 # minimal information 1138 if self.separate_spelling(): 1139 yield self.spelling_prefix, SpellField(self.analyzer) 1140 1141 def separate_spelling(self): 1142 return self.spelling and self.analyzer.has_morph() 1143 1144 def spelling_fieldname(self, fieldname): 1145 if self.separate_spelling(): 1146 return self.spelling_prefix + fieldname 1147 else: 1148 return fieldname 1149 1150 1151class SpellField(FieldType): 1152 """ 1153 This is a utility field type meant to be returned by ``TEXT.subfields()`` 1154 when it needs a minimal field to store the spellable words. 1155 """ 1156 1157 def __init__(self, analyzer): 1158 self.format = formats.Frequency() 1159 self.analyzer = analyzer 1160 self.column_type = None 1161 self.scorabe = False 1162 self.stored = False 1163 self.unique = False 1164 self.indexed = True 1165 self.spelling = False 1166 1167 # All the text analysis methods add "nomorph" to the keywords to get 1168 # unmorphed term texts 1169 1170 def index(self, value, boost=1.0, **kwargs): 1171 kwargs["nomorph"] = True 1172 return FieldType.index(self, value, boost=boost, **kwargs) 1173 1174 def tokenzie(self, value, **kwargs): 1175 kwargs["nomorph"] = True 1176 return FieldType.tokenize(self, value, **kwargs) 1177 1178 def process_text(self, qstring, mode='', **kwargs): 1179 kwargs["nomorph"] = True 1180 return FieldType.process_text(self, qstring, mode=mode, **kwargs) 1181 1182 1183class NGRAM(FieldType): 1184 """ 1185 Configured field that indexes text as N-grams. For example, with a field 1186 type NGRAM(3,4), the value "hello" will be indexed as tokens 1187 "hel", "hell", "ell", "ello", "llo". This field type chops the entire text 1188 into N-grams, including whitespace and punctuation. See :class:`NGRAMWORDS` 1189 for a field type that breaks the text into words first before chopping the 1190 words into N-grams. 1191 """ 1192 1193 scorable = True 1194 1195 def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0, 1196 queryor=False, phrase=False, sortable=False): 1197 """ 1198 :param minsize: The minimum length of the N-grams. 1199 :param maxsize: The maximum length of the N-grams. 1200 :param stored: Whether to store the value of this field with the 1201 document. Since this field type generally contains a lot of text, 1202 you should avoid storing it with the document unless you need to, 1203 for example to allow fast excerpts in the search results. 1204 :param queryor: if True, combine the N-grams with an Or query. The 1205 default is to combine N-grams with an And query. 1206 :param phrase: store positions on the N-grams to allow exact phrase 1207 searching. The default is off. 1208 """ 1209 1210 formatclass = formats.Frequency 1211 if phrase: 1212 formatclass = formats.Positions 1213 1214 self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) 1215 self.format = formatclass(field_boost=field_boost) 1216 self.analyzer = analysis.NgramAnalyzer(minsize, maxsize) 1217 self.stored = stored 1218 self.queryor = queryor 1219 self.set_sortable(sortable) 1220 1221 def self_parsing(self): 1222 return True 1223 1224 def parse_query(self, fieldname, qstring, boost=1.0): 1225 from whoosh import query 1226 1227 terms = [query.Term(fieldname, g) 1228 for g in self.process_text(qstring, mode='query')] 1229 cls = query.Or if self.queryor else query.And 1230 1231 return cls(terms, boost=boost) 1232 1233 1234class NGRAMWORDS(NGRAM): 1235 """ 1236 Configured field that chops text into words using a tokenizer, 1237 lowercases the words, and then chops the words into N-grams. 1238 """ 1239 1240 scorable = True 1241 1242 def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0, 1243 tokenizer=None, at=None, queryor=False, sortable=False): 1244 """ 1245 :param minsize: The minimum length of the N-grams. 1246 :param maxsize: The maximum length of the N-grams. 1247 :param stored: Whether to store the value of this field with the 1248 document. Since this field type generally contains a lot of text, 1249 you should avoid storing it with the document unless you need to, 1250 for example to allow fast excerpts in the search results. 1251 :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer` 1252 used to break the text into words. 1253 :param at: if 'start', only takes N-grams from the start of the word. 1254 If 'end', only takes N-grams from the end. Otherwise the default 1255 is to take all N-grams from each word. 1256 :param queryor: if True, combine the N-grams with an Or query. The 1257 default is to combine N-grams with an And query. 1258 """ 1259 1260 self.analyzer = analysis.NgramWordAnalyzer(minsize, maxsize, tokenizer, 1261 at=at) 1262 self.format = formats.Frequency(field_boost=field_boost) 1263 self.stored = stored 1264 self.queryor = queryor 1265 self.set_sortable(sortable) 1266 1267 1268# Other fields 1269 1270class ReverseField(FieldWrapper): 1271 def __init__(self, subfield, prefix="rev_"): 1272 FieldWrapper.__init__(self, subfield, prefix) 1273 self.analyzer = subfield.analyzer | analysis.ReverseTextFilter() 1274 self.format = BasicFormat(lengths=False, weights=False) 1275 1276 self.scorable = False 1277 self.set_sortable(False) 1278 self.stored = False 1279 self.unique = False 1280 self.vector = False 1281 1282 def subfields(self): 1283 yield "", self.subfield 1284 yield self.name_prefix, self 1285 1286 1287# Schema class 1288 1289class MetaSchema(type): 1290 def __new__(cls, name, bases, attrs): 1291 super_new = super(MetaSchema, cls).__new__ 1292 if not any(b for b in bases if isinstance(b, MetaSchema)): 1293 # If this isn't a subclass of MetaSchema, don't do anything special 1294 return super_new(cls, name, bases, attrs) 1295 1296 # Create the class 1297 special_attrs = {} 1298 for key in list(attrs.keys()): 1299 if key.startswith("__"): 1300 special_attrs[key] = attrs.pop(key) 1301 new_class = super_new(cls, name, bases, special_attrs) 1302 1303 fields = {} 1304 for b in bases: 1305 if hasattr(b, "_clsfields"): 1306 fields.update(b._clsfields) 1307 fields.update(attrs) 1308 new_class._clsfields = fields 1309 return new_class 1310 1311 def schema(self): 1312 return Schema(**self._clsfields) 1313 1314 1315class Schema(object): 1316 """ 1317 Represents the collection of fields in an index. Maps field names to 1318 FieldType objects which define the behavior of each field. 1319 1320 Low-level parts of the index use field numbers instead of field names for 1321 compactness. This class has several methods for converting between the 1322 field name, field number, and field object itself. 1323 """ 1324 1325 def __init__(self, **fields): 1326 """ 1327 All keyword arguments to the constructor are treated as fieldname = 1328 fieldtype pairs. The fieldtype can be an instantiated FieldType object, 1329 or a FieldType sub-class (in which case the Schema will instantiate it 1330 with the default constructor before adding it). 1331 1332 For example:: 1333 1334 s = Schema(content = TEXT, 1335 title = TEXT(stored = True), 1336 tags = KEYWORD(stored = True)) 1337 """ 1338 1339 self._fields = {} 1340 self._subfields = {} 1341 self._dyn_fields = {} 1342 1343 for name in sorted(fields.keys()): 1344 self.add(name, fields[name]) 1345 1346 def copy(self): 1347 """ 1348 Returns a shallow copy of the schema. The field instances are not 1349 deep copied, so they are shared between schema copies. 1350 """ 1351 1352 return self.__class__(**self._fields) 1353 1354 def __eq__(self, other): 1355 return (other.__class__ is self.__class__ 1356 and list(self.items()) == list(other.items())) 1357 1358 def __ne__(self, other): 1359 return not(self.__eq__(other)) 1360 1361 def __repr__(self): 1362 return "<%s: %r>" % (self.__class__.__name__, self.names()) 1363 1364 def __iter__(self): 1365 """ 1366 Returns the field objects in this schema. 1367 """ 1368 1369 return iter(self._fields.values()) 1370 1371 def __getitem__(self, name): 1372 """ 1373 Returns the field associated with the given field name. 1374 """ 1375 1376 # If the name is in the dictionary, just return it 1377 if name in self._fields: 1378 return self._fields[name] 1379 1380 # Check if the name matches a dynamic field 1381 for expr, fieldtype in itervalues(self._dyn_fields): 1382 if expr.match(name): 1383 return fieldtype 1384 1385 raise KeyError("No field named %r" % (name,)) 1386 1387 def __len__(self): 1388 """ 1389 Returns the number of fields in this schema. 1390 """ 1391 1392 return len(self._fields) 1393 1394 def __contains__(self, fieldname): 1395 """ 1396 Returns True if a field by the given name is in this schema. 1397 """ 1398 1399 # Defined in terms of __getitem__ so that there's only one method to 1400 # override to provide dynamic fields 1401 try: 1402 field = self[fieldname] 1403 return field is not None 1404 except KeyError: 1405 return False 1406 1407 def __setstate__(self, state): 1408 if "_subfields" not in state: 1409 state["_subfields"] = {} 1410 self.__dict__.update(state) 1411 1412 def to_bytes(self, fieldname, value): 1413 return self[fieldname].to_bytes(value) 1414 1415 def items(self): 1416 """ 1417 Returns a list of ("fieldname", field_object) pairs for the fields 1418 in this schema. 1419 """ 1420 1421 return sorted(self._fields.items()) 1422 1423 def names(self, check_names=None): 1424 """ 1425 Returns a list of the names of the fields in this schema. 1426 1427 :param check_names: (optional) sequence of field names to check 1428 whether the schema accepts them as (dynamic) field names - 1429 acceptable names will also be in the result list. 1430 Note: You may also have static field names in check_names, that 1431 won't create duplicates in the result list. Unsupported names 1432 will not be in the result list. 1433 """ 1434 1435 fieldnames = set(self._fields.keys()) 1436 if check_names is not None: 1437 check_names = set(check_names) - fieldnames 1438 fieldnames.update(fieldname for fieldname in check_names 1439 if fieldname in self) 1440 return sorted(fieldnames) 1441 1442 def clean(self): 1443 for field in self: 1444 field.clean() 1445 1446 def add(self, name, fieldtype, glob=False): 1447 """ 1448 Adds a field to this schema. 1449 1450 :param name: The name of the field. 1451 :param fieldtype: An instantiated fields.FieldType object, or a 1452 FieldType subclass. If you pass an instantiated object, the schema 1453 will use that as the field configuration for this field. If you 1454 pass a FieldType subclass, the schema will automatically 1455 instantiate it with the default constructor. 1456 """ 1457 1458 # If the user passed a type rather than an instantiated field object, 1459 # instantiate it automatically 1460 if type(fieldtype) is type: 1461 try: 1462 fieldtype = fieldtype() 1463 except: 1464 e = sys.exc_info()[1] 1465 raise FieldConfigurationError("Error: %s instantiating field " 1466 "%r: %r" % (e, name, fieldtype)) 1467 1468 if not isinstance(fieldtype, FieldType): 1469 raise FieldConfigurationError("%r is not a FieldType object" 1470 % fieldtype) 1471 1472 self._subfields[name] = sublist = [] 1473 for prefix, subfield in fieldtype.subfields(): 1474 fname = prefix + name 1475 sublist.append(fname) 1476 1477 # Check field name 1478 if fname.startswith("_"): 1479 raise FieldConfigurationError("Names cannot start with _") 1480 elif " " in fname: 1481 raise FieldConfigurationError("Names cannot contain spaces") 1482 elif fname in self._fields or (glob and fname in self._dyn_fields): 1483 raise FieldConfigurationError("%r already in schema" % fname) 1484 1485 # Add the field 1486 if glob: 1487 expr = re.compile(fnmatch.translate(name)) 1488 self._dyn_fields[fname] = (expr, subfield) 1489 else: 1490 fieldtype.on_add(self, fname) 1491 self._fields[fname] = subfield 1492 1493 def remove(self, fieldname): 1494 if fieldname in self._fields: 1495 self._fields[fieldname].on_remove(self, fieldname) 1496 del self._fields[fieldname] 1497 1498 if fieldname in self._subfields: 1499 for subname in self._subfields[fieldname]: 1500 if subname in self._fields: 1501 del self._fields[subname] 1502 del self._subfields[fieldname] 1503 1504 elif fieldname in self._dyn_fields: 1505 del self._dyn_fields[fieldname] 1506 1507 else: 1508 raise KeyError("No field named %r" % fieldname) 1509 1510 def indexable_fields(self, fieldname): 1511 if fieldname in self._subfields: 1512 for subname in self._subfields[fieldname]: 1513 yield subname, self._fields[subname] 1514 else: 1515 # Use __getitem__ here instead of getting it directly from _fields 1516 # because it might be a glob 1517 yield fieldname, self[fieldname] 1518 1519 def has_scorable_fields(self): 1520 return any(ftype.scorable for ftype in self) 1521 1522 def stored_names(self): 1523 """ 1524 Returns a list of the names of fields that are stored. 1525 """ 1526 1527 return [name for name, field in self.items() if field.stored] 1528 1529 def scorable_names(self): 1530 """ 1531 Returns a list of the names of fields that store field 1532 lengths. 1533 """ 1534 1535 return [name for name, field in self.items() if field.scorable] 1536 1537 1538class SchemaClass(with_metaclass(MetaSchema, Schema)): 1539 """ 1540 Allows you to define a schema using declarative syntax, similar to 1541 Django models:: 1542 1543 class MySchema(SchemaClass): 1544 path = ID 1545 date = DATETIME 1546 content = TEXT 1547 1548 You can use inheritance to share common fields between schemas:: 1549 1550 class Parent(SchemaClass): 1551 path = ID(stored=True) 1552 date = DATETIME 1553 1554 class Child1(Parent): 1555 content = TEXT(positions=False) 1556 1557 class Child2(Parent): 1558 tags = KEYWORD 1559 1560 This class overrides ``__new__`` so instantiating your sub-class always 1561 results in an instance of ``Schema``. 1562 1563 >>> class MySchema(SchemaClass): 1564 ... title = TEXT(stored=True) 1565 ... content = TEXT 1566 ... 1567 >>> s = MySchema() 1568 >>> type(s) 1569 <class 'whoosh.fields.Schema'> 1570 1571 """ 1572 1573 def __new__(cls, *args, **kwargs): 1574 obj = super(Schema, cls).__new__(Schema) 1575 kw = getattr(cls, "_clsfields", {}) 1576 kw.update(kwargs) 1577 obj.__init__(*args, **kw) 1578 return obj 1579 1580 1581def ensure_schema(schema): 1582 if isinstance(schema, type) and issubclass(schema, Schema): 1583 schema = schema.schema() 1584 if not isinstance(schema, Schema): 1585 raise FieldConfigurationError("%r is not a Schema" % schema) 1586 return schema 1587 1588 1589def merge_fielddict(d1, d2): 1590 keyset = set(d1.keys()) | set(d2.keys()) 1591 out = {} 1592 for name in keyset: 1593 field1 = d1.get(name) 1594 field2 = d2.get(name) 1595 if field1 and field2 and field1 != field2: 1596 raise Exception("Inconsistent field %r: %r != %r" 1597 % (name, field1, field2)) 1598 out[name] = field1 or field2 1599 return out 1600 1601 1602def merge_schema(s1, s2): 1603 schema = Schema() 1604 schema._fields = merge_fielddict(s1._fields, s2._fields) 1605 schema._dyn_fields = merge_fielddict(s1._dyn_fields, s2._dyn_fields) 1606 return schema 1607 1608 1609def merge_schemas(schemas): 1610 schema = schemas[0] 1611 for i in xrange(1, len(schemas)): 1612 schema = merge_schema(schema, schemas[i]) 1613 return schema 1614