1# Copyright 2007 Matt Chaput. All rights reserved.
2#
3# Redistribution and use in source and binary forms, with or without
4# modification, are permitted provided that the following conditions are met:
5#
6#    1. Redistributions of source code must retain the above copyright notice,
7#       this list of conditions and the following disclaimer.
8#
9#    2. Redistributions in binary form must reproduce the above copyright
10#       notice, this list of conditions and the following disclaimer in the
11#       documentation and/or other materials provided with the distribution.
12#
13# THIS SOFTWARE IS PROVIDED BY MATT CHAPUT ``AS IS'' AND ANY EXPRESS OR
14# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF
15# MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO
16# EVENT SHALL MATT CHAPUT OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT,
17# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
18# LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA,
19# OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
20# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
21# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE,
22# EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
23#
24# The views and conclusions contained in the software and documentation are
25# those of the authors and should not be interpreted as representing official
26# policies, either expressed or implied, of Matt Chaput.
27
28"""
29 Contains functions and classes related to fields.
30"""
31
32import datetime, fnmatch, re, struct, sys
33from array import array
34from decimal import Decimal
35
36from whoosh import analysis, columns, formats
37from whoosh.compat import with_metaclass
38from whoosh.compat import itervalues, xrange
39from whoosh.compat import bytes_type, string_type, text_type
40from whoosh.system import emptybytes
41from whoosh.system import pack_byte, unpack_byte
42from whoosh.util.numeric import to_sortable, from_sortable
43from whoosh.util.numeric import typecode_max, NaN
44from whoosh.util.text import utf8encode, utf8decode
45from whoosh.util.times import datetime_to_long, long_to_datetime
46
47
48# Exceptions
49
class FieldConfigurationError(Exception):
    """Raised for an invalid or inconsistent field configuration."""
    pass
52
53
class UnknownFieldError(Exception):
    """Raised when an operation refers to a field name that is not known."""
    pass
56
57
58# Field Types
59
class FieldType(object):
    """
    Represents a field configuration.

    The FieldType object supports the following attributes:

    * format (formats.Format): the storage format for posting blocks.

    * analyzer (analysis.Analyzer): the analyzer to use to turn text into
      terms.

    * scorable (boolean): whether searches against this field may be scored.
      This controls whether the index stores per-document field lengths for
      this field.

    * stored (boolean): whether the content of this field is stored for each
      document. For example, in addition to indexing the title of a document,
      you usually want to store the title so it can be presented as part of
      the search results.

    * unique (boolean): whether this field's value is unique to each document.
      For example, 'path' or 'ID'. IndexWriter.update_document() will use
      fields marked as 'unique' to find the previous version of a document
      being updated.

    * multitoken_query is a string indicating what kind of query to use when
      a "word" in a user query parses into multiple tokens. The string is
      interpreted by the query parser. The strings understood by the default
      query parser are "first" (use first token only), "and" (join the tokens
      with an AND query), "or" (join the tokens with OR), "phrase" (join
      the tokens with a phrase query), and "default" (use the query parser's
      default join type).

    * vector (formats.Format or boolean): the format to use to store term
      vectors. If not a ``Format`` object, any true value means to use the
      index format as the term vector format. Any false value means don't
      store term vectors for this field.

    The constructor for the base field type simply lets you supply your own
    attribute values.  Subclasses may configure some or all of this for you.
    """

    analyzer = format = scorable = stored = unique = vector = None
    indexed = True
    multitoken_query = "default"
    sortable_typecode = None
    column_type = None

    def __init__(self, format, analyzer, scorable=False,
                 stored=False, unique=False, multitoken_query="default",
                 sortable=False, vector=None):
        self.format = format
        self.analyzer = analyzer
        self.scorable = scorable
        self.stored = stored
        self.unique = unique
        self.multitoken_query = multitoken_query
        self.set_sortable(sortable)

        if isinstance(vector, formats.Format):
            self.vector = vector
        elif vector:
            # Any other true value means "use the index format for vectors"
            self.vector = self.format
        else:
            self.vector = None

    def __repr__(self):
        return ("%s(format=%r, scorable=%s, stored=%s, unique=%s)"
                % (self.__class__.__name__, self.format, self.scorable,
                   self.stored, self.unique))

    def __eq__(self, other):
        # BUG FIX: the previous implementation passed an eagerly-built tuple
        # to all(), so the isinstance() check did not guard the attribute
        # accesses and comparing to a non-FieldType raised AttributeError.
        # Use short-circuiting "and" so non-FieldType objects compare unequal.
        return (isinstance(other, FieldType)
                and self.format == other.format
                and self.scorable == other.scorable
                and self.stored == other.stored
                and self.unique == other.unique
                and self.column_type == other.column_type)

    def __ne__(self, other):
        return not self.__eq__(other)

    # Text

    def index(self, value, **kwargs):
        """Returns an iterator of (btext, frequency, weight, encoded_value)
        tuples for each unique word in the input value.

        The default implementation uses the ``analyzer`` attribute to tokenize
        the value into strings, then encodes them into bytes using UTF-8.
        """

        if not self.format:
            raise Exception("%s field %r cannot index without a format"
                            % (self.__class__.__name__, self))
        if not isinstance(value, (text_type, list, tuple)):
            raise ValueError("%r is not unicode or sequence" % value)
        assert isinstance(self.format, formats.Format)

        if "mode" not in kwargs:
            kwargs["mode"] = "index"

        word_values = self.format.word_values
        ana = self.analyzer
        for tstring, freq, wt, vbytes in word_values(value, ana, **kwargs):
            yield (utf8encode(tstring)[0], freq, wt, vbytes)

    def tokenize(self, value, **kwargs):
        """
        Analyzes the given string and returns an iterator of Token objects
        (note: for performance reasons, actually the same token yielded over
        and over with different attributes).
        """

        if not self.analyzer:
            raise Exception("%s field has no analyzer" % self.__class__)
        return self.analyzer(value, **kwargs)

    def process_text(self, qstring, mode='', **kwargs):
        """
        Analyzes the given string and returns an iterator of token texts.

        >>> field = fields.TEXT()
        >>> list(field.process_text("The ides of March"))
        ["ides", "march"]
        """

        if not self.format:
            raise Exception("%s field has no format" % self)
        return (t.text for t in self.tokenize(qstring, mode=mode, **kwargs))

    # Conversion

    def to_bytes(self, value):
        """
        Returns a bytes representation of the given value, appropriate to be
        written to disk. The default implementation assumes a unicode value and
        encodes it using UTF-8.
        """

        if isinstance(value, (list, tuple)):
            value = value[0]
        if not isinstance(value, bytes_type):
            value = utf8encode(value)[0]
        return value

    def to_column_value(self, value):
        """
        Returns an object suitable to be inserted into the document values
        column for this field. The default implementation simply calls
        ``self.to_bytes(value)``.
        """

        return self.to_bytes(value)

    def from_bytes(self, bs):
        """Decodes an on-disk term bytestring back into a unicode value."""
        return utf8decode(bs)[0]

    def from_column_value(self, value):
        """Decodes a stored column value back into the original value."""
        return self.from_bytes(value)

    # Columns/sorting

    def set_sortable(self, sortable):
        """Configures ``column_type`` from the ``sortable`` argument: a
        ``columns.Column`` is used as-is, any other true value selects
        ``default_column()``, and a false value disables sorting."""
        if sortable:
            if isinstance(sortable, columns.Column):
                self.column_type = sortable
            else:
                self.column_type = self.default_column()
        else:
            self.column_type = None

    def sortable_terms(self, ixreader, fieldname):
        """
        Returns an iterator of the "sortable" tokens in the given reader and
        field. These values can be used for sorting. The default implementation
        simply returns all tokens in the field.

        This can be overridden by field types such as NUMERIC where some values
        in a field are not useful for sorting.
        """

        return ixreader.lexicon(fieldname)

    def default_column(self):
        """Returns the column object to use when ``sortable`` is a plain
        true value rather than an explicit column."""
        return columns.VarBytesColumn()

    # Parsing

    def self_parsing(self):
        """
        Subclasses should override this method to return True if they want
        the query parser to call the field's ``parse_query()`` method instead
        of running the analyzer on text in this field. This is useful where
        the field needs full control over how queries are interpreted, such
        as in the numeric field type.
        """

        return False

    def parse_query(self, fieldname, qstring, boost=1.0):
        """
        When ``self_parsing()`` returns True, the query parser will call
        this method to parse basic query text.
        """

        raise NotImplementedError(self.__class__.__name__)

    def parse_range(self, fieldname, start, end, startexcl, endexcl,
                    boost=1.0):
        """
        When ``self_parsing()`` returns True, the query parser will call
        this method to parse range query text. If this method returns None
        instead of a query object, the parser will fall back to parsing the
        start and end terms using process_text().
        """

        return None

    # Spelling

    def separate_spelling(self):
        """
        Returns True if the field stores unstemmed words in a separate field for
        spelling suggestions.
        """

        return False

    def spelling_fieldname(self, fieldname):
        """
        Returns the name of a field to use for spelling suggestions instead of
        this field.

        :param fieldname: the name of this field.
        """

        return fieldname

    def spellable_words(self, value):
        """Returns an iterator of each unique word (in sorted order) in the
        input value, suitable for inclusion in the field's word graph.

        The default behavior is to call the field analyzer with the keyword
        argument ``no_morph=True``, which should make the analyzer skip any
        morphological transformation filters (e.g. stemming) to preserve the
        original form of the words. Exotic field types may need to override
        this behavior.
        """

        if isinstance(value, (list, tuple)):
            words = value
        else:
            words = [token.text for token
                     in self.analyzer(value, no_morph=True)]

        return iter(sorted(set(words)))

    # Utility

    def subfields(self):
        """
        Returns an iterator of ``(name_prefix, fieldobject)`` pairs for the
        fields that need to be indexed when content is put in this field. The
        default implementation simply yields ``("", self)``.
        """

        yield "", self

    def supports(self, name):
        """
        Returns True if the underlying format supports the given posting
        value type.

        >>> field = TEXT()
        >>> field.supports("positions")
        True
        >>> field.supports("chars")
        False
        """

        return self.format.supports(name)

    def clean(self):
        """
        Clears any cached information in the field and any child objects.
        """

        if self.format and hasattr(self.format, "clean"):
            self.format.clean()

    # Events

    def on_add(self, schema, fieldname):
        """Called when this field is added to a schema. Default: no-op."""
        pass

    def on_remove(self, schema, fieldname):
        """Called when this field is removed from a schema. Default: no-op."""
        pass
358
359
360# Wrapper base class
361
class FieldWrapper(FieldType):
    """
    Base class for a field that wraps another field, delegating most
    operations to the wrapped ``subfield``.
    """

    def __init__(self, subfield, prefix):
        """
        :param subfield: the wrapped field; either a ``FieldType`` instance or
            a class (a class is instantiated with no arguments).
        :param prefix: the name prefix associated with this wrapper's
            subfields.
        """

        if isinstance(subfield, type):
            subfield = subfield()
        self.subfield = subfield
        self.name_prefix = prefix

        # By default we'll copy all the subfield's attributes -- override these
        # in subclass constructor for things you want to change
        self.analyzer = subfield.analyzer
        self.format = subfield.format
        self.column_type = subfield.column_type
        self.scorable = subfield.scorable
        self.stored = subfield.stored
        self.unique = subfield.unique
        self.indexed = subfield.indexed
        self.vector = subfield.vector

    def __eq__(self, other):
        return self.subfield.__eq__(other)

    def __ne__(self, other):
        return self.subfield.__ne__(other)

    # Text

    # NOTE: the text methods (index, tokenize, process_text) are deliberately
    # NOT delegated to the subfield; the FieldType implementations are used
    # with the attributes copied in __init__.

    # Conversion

    def to_bytes(self, value):
        return self.subfield.to_bytes(value)

    def to_column_value(self, value):
        return self.subfield.to_column_value(value)

    def from_bytes(self, bs):
        return self.subfield.from_bytes(bs)

    def from_column_value(self, value):
        return self.subfield.from_column_value(value)

    # Sorting/columns

    def set_sortable(self, sortable):
        self.subfield.set_sortable(sortable)

    def sortable_terms(self, ixreader, fieldname):
        return self.subfield.sortable_terms(ixreader, fieldname)

    def default_column(self):
        return self.subfield.default_column()

    # Parsing

    def self_parsing(self):
        return self.subfield.self_parsing()

    def parse_query(self, fieldname, qstring, boost=1.0):
        return self.subfield.parse_query(fieldname, qstring, boost)

    def parse_range(self, fieldname, start, end, startexcl, endexcl, boost=1.0):
        # BUG FIX: the delegated result was previously discarded (no return),
        # so this method always returned None regardless of the subfield.
        return self.subfield.parse_range(fieldname, start, end, startexcl,
                                         endexcl, boost)

    # Utility

    def subfields(self):
        # The default FieldWrapper.subfields() implementation DOES NOT split
        # out the subfield here -- you need to override if that's what you want
        yield "", self

    def supports(self, name):
        return self.subfield.supports(name)

    def clean(self):
        self.subfield.clean()

    # Events

    def on_add(self, schema, fieldname):
        self.subfield.on_add(schema, fieldname)

    def on_remove(self, schema, fieldname):
        self.subfield.on_remove(schema, fieldname)
454
455
456# Pre-configured field types
457
class ID(FieldType):
    """
    Configured field type that indexes the field's entire value as a single
    token. Useful for data you don't want tokenized, such as the path of a
    file.
    """

    def __init__(self, stored=False, unique=False, field_boost=1.0,
                 sortable=False, analyzer=None):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        """

        # Use a whole-value analyzer unless the caller supplied one
        self.analyzer = analyzer or analysis.IDAnalyzer()
        self.stored = stored
        self.unique = unique
        # Existence format: only the document ID is recorded per posting
        self.format = formats.Existence(field_boost=field_boost)
        self.set_sortable(sortable)
478
479
class IDLIST(FieldType):
    """
    Configured field type for fields holding multiple IDs separated by
    whitespace and/or punctuation (or any other delimiter, via the
    ``expression`` parameter).
    """

    def __init__(self, stored=False, unique=False, expression=None,
                 field_boost=1.0):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param expression: The regular expression object to use to extract
            tokens. The default expression breaks tokens on CRs, LFs, tabs,
            spaces, commas, and semicolons.
        """

        # Default pattern: runs of characters that are not CR, LF, tab,
        # space, comma, or semicolon
        pattern = expression or re.compile(r"[^\r\n\t ,;]+")
        self.analyzer = analysis.RegexAnalyzer(expression=pattern)
        self.stored = stored
        self.unique = unique
        # Existence format: only the document ID is recorded per posting
        self.format = formats.Existence(field_boost=field_boost)
503
504
class NUMERIC(FieldType):
    """
    Special field type that lets you index integer or floating point
    numbers in relatively short fixed-width terms. The field converts numbers
    to sortable bytes for you before indexing.

    You specify the numeric type of the field (``int`` or ``float``) when you
    create the ``NUMERIC`` object. The default is ``int``. For ``int``, you can
    specify a size in bits (``32`` or ``64``). For both ``int`` and ``float``
    you can specify a ``signed`` keyword argument (default is ``True``).

    >>> schema = Schema(path=STORED, position=NUMERIC(int, 64, signed=False))
    >>> ix = storage.create_index(schema)
    >>> with ix.writer() as w:
    ...     w.add_document(path="/a", position=5820402204)
    ...

    You can also use the NUMERIC field to store Decimal instances by specifying
    a type of ``int`` or ``long`` and the ``decimal_places`` keyword argument.
    This simply multiplies each number by ``(10 ** decimal_places)`` before
    storing it as an integer. Of course this may throw away decimal precision
    (by truncating, not rounding) and imposes the same maximum value limits as
    ``int``/``long``, but these may be acceptable for certain applications.

    >>> from decimal import Decimal
    >>> schema = Schema(path=STORED, position=NUMERIC(int, decimal_places=4))
    >>> ix = storage.create_index(schema)
    >>> with ix.writer() as w:
    ...     w.add_document(path="/a", position=Decimal("123.45"))
    ...

    """

    def __init__(self, numtype=int, bits=32, stored=False, unique=False,
                 field_boost=1.0, decimal_places=0, shift_step=4, signed=True,
                 sortable=False, default=None):
        """
        :param numtype: the type of numbers that can be stored in this field,
            either ``int``, ``float``. If you use ``Decimal``,
            use the ``decimal_places`` argument to control how many decimal
            places the field will store.
        :param bits: When ``numtype`` is ``int``, the number of bits to use to
            store the number: 8, 16, 32, or 64.
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param decimal_places: specifies the number of decimal places to save
            when storing Decimal instances. If you set this, you will always
            get Decimal instances back from the field.
        :param shift_step: The number of bits of precision to shift away at
            each tiered indexing level. Values should generally be 1-8. Lower
            values yield faster searches but take up more space. A value
            of `0` means no tiered indexing.
        :param signed: Whether the numbers stored in this field may be
            negative.
        """

        # Allow users to specify strings instead of Python types in case
        # docstring isn't clear
        if numtype == "int":
            numtype = int
        if numtype == "float":
            numtype = float
        # Raise an error if the user tries to use a type other than int or
        # float
        if numtype is Decimal:
            numtype = int
            if not decimal_places:
                raise TypeError("To store Decimal instances, you must set the "
                                "decimal_places argument")
        elif numtype not in (int, float):
            raise TypeError("Can't use %r as a type, use int or float"
                            % numtype)
        # Sanity check
        if numtype is float and decimal_places:
            raise Exception("A float type and decimal_places argument %r are "
                            "incompatible" % decimal_places)

        intsizes = [8, 16, 32, 64]
        intcodes = ["B", "H", "I", "Q"]
        # Set up field configuration based on type and size
        if numtype is float:
            bits = 64  # Floats are converted to 64 bit ints
        else:
            if bits not in intsizes:
                raise Exception("Invalid bits %r, use 8, 16, 32, or 64"
                                % bits)
        # Type code for the *sortable* representation
        self.sortable_typecode = intcodes[intsizes.index(bits)]
        self._struct = struct.Struct(">" + str(self.sortable_typecode))

        self.numtype = numtype
        self.bits = bits
        self.stored = stored
        self.unique = unique
        self.decimal_places = decimal_places
        self.shift_step = shift_step
        self.signed = signed
        self.analyzer = analysis.IDAnalyzer()
        # Don't store any information other than the doc ID
        self.format = formats.Existence(field_boost=field_boost)
        self.min_value, self.max_value = self._min_max()

        # Column configuration
        if default is None:
            if numtype is int:
                default = typecode_max[self.sortable_typecode]
            else:
                default = NaN
        elif not self.is_valid(default):
            raise Exception("The default %r is not a valid number for this "
                            "field" % default)

        self.default = default
        self.set_sortable(sortable)

    def __getstate__(self):
        # The compiled Struct object can't be pickled; it is rebuilt in
        # __setstate__
        d = self.__dict__.copy()
        if "_struct" in d:
            del d["_struct"]
        return d

    def __setstate__(self, d):
        self.__dict__.update(d)
        self._struct = struct.Struct(">" + str(self.sortable_typecode))
        if "min_value" not in d:
            # BUG FIX: previously the recomputed range was assigned into the
            # local dict ``d`` *after* __dict__.update() had already run, so
            # the values never reached the instance. Assign to self instead.
            self.min_value, self.max_value = self._min_max()

    def _min_max(self):
        numtype = self.numtype
        bits = self.bits
        signed = self.signed

        # Calculate the minimum and maximum possible values for error checking
        min_value = from_sortable(numtype, bits, signed, 0)
        max_value = from_sortable(numtype, bits, signed, 2 ** bits - 1)

        return min_value, max_value

    def default_column(self):
        return columns.NumericColumn(self.sortable_typecode,
                                     default=self.default)

    def is_valid(self, x):
        """Returns True if ``x`` can be encoded by this field, False if the
        conversion raises ValueError or OverflowError."""
        try:
            x = self.to_bytes(x)
        except ValueError:
            return False
        except OverflowError:
            return False

        return True

    def index(self, num, **kwargs):
        # If the user gave us a list of numbers, recurse on the list
        if isinstance(num, (list, tuple)):
            for n in num:
                for item in self.index(n):
                    yield item
            return

        # word, freq, weight, valuestring
        if self.shift_step:
            for shift in xrange(0, self.bits, self.shift_step):
                yield (self.to_bytes(num, shift), 1, 1.0, emptybytes)
        else:
            yield (self.to_bytes(num), 1, 1.0, emptybytes)

    def prepare_number(self, x):
        """Converts ``x`` to this field's number type, applying the
        ``decimal_places`` scaling and range-checking against the field's
        min/max values."""
        if x == emptybytes or x is None:
            return x

        dc = self.decimal_places
        if dc and isinstance(x, (string_type, Decimal)):
            x = Decimal(x) * (10 ** dc)
        elif isinstance(x, Decimal):
            raise TypeError("Can't index a Decimal object unless you specified "
                            "decimal_places on the field")

        try:
            x = self.numtype(x)
        except OverflowError:
            raise ValueError("Value %r overflowed number type %r"
                             % (x, self.numtype))

        if x < self.min_value or x > self.max_value:
            raise ValueError("Numeric field value %s out of range [%s, %s]"
                             % (x, self.min_value, self.max_value))
        return x

    def unprepare_number(self, x):
        """Reverses the ``decimal_places`` scaling applied by
        ``prepare_number``, returning a Decimal when scaling is in effect."""
        dc = self.decimal_places
        if dc:
            s = str(x)
            x = Decimal(s[:-dc] + "." + s[-dc:])
        return x

    def to_column_value(self, x):
        if isinstance(x, (list, tuple, array)):
            x = x[0]
        x = self.prepare_number(x)
        return to_sortable(self.numtype, self.bits, self.signed, x)

    def from_column_value(self, x):
        x = from_sortable(self.numtype, self.bits, self.signed, x)
        return self.unprepare_number(x)

    def to_bytes(self, x, shift=0):
        # Try to avoid re-encoding; this sucks because on Python 2 we can't
        # tell the difference between a string and encoded bytes, so we have
        # to require the user use unicode when they mean string
        if isinstance(x, bytes_type):
            return x

        if x == emptybytes or x is None:
            return self.sortable_to_bytes(0)

        x = self.prepare_number(x)
        x = to_sortable(self.numtype, self.bits, self.signed, x)
        return self.sortable_to_bytes(x, shift)

    def sortable_to_bytes(self, x, shift=0):
        # The encoded term is the shift amount (one byte) followed by the
        # big-endian packed value
        if shift:
            x >>= shift
        return pack_byte(shift) + self._struct.pack(x)

    def from_bytes(self, bs):
        x = self._struct.unpack(bs[1:])[0]
        x = from_sortable(self.numtype, self.bits, self.signed, x)
        x = self.unprepare_number(x)
        return x

    def process_text(self, text, **kwargs):
        return (self.to_bytes(text),)

    def self_parsing(self):
        # Numeric query text is parsed by the field, not the analyzer
        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        from whoosh import query
        from whoosh.qparser.common import QueryParserError

        if qstring == "*":
            return query.Every(fieldname, boost=boost)

        if not self.is_valid(qstring):
            raise QueryParserError("%r is not a valid number" % qstring)

        token = self.to_bytes(qstring)
        return query.Term(fieldname, token, boost=boost)

    def parse_range(self, fieldname, start, end, startexcl, endexcl,
                    boost=1.0):
        from whoosh import query
        from whoosh.qparser.common import QueryParserError

        if start is not None:
            if not self.is_valid(start):
                raise QueryParserError("Range start %r is not a valid number"
                                       % start)
            start = self.prepare_number(start)
        if end is not None:
            if not self.is_valid(end):
                raise QueryParserError("Range end %r is not a valid number"
                                       % end)
            end = self.prepare_number(end)
        return query.NumericRange(fieldname, start, end, startexcl, endexcl,
                                  boost=boost)

    def sortable_terms(self, ixreader, fieldname):
        zero = b"\x00"
        for token in ixreader.lexicon(fieldname):
            if token[0:1] != zero:
                # Only yield the full-precision values
                break
            yield token
781
782
783class DATETIME(NUMERIC):
784    """
785    Special field type that lets you index datetime objects. The field
786    converts the datetime objects to sortable text for you before indexing.
787
788    Since this field is based on Python's datetime module it shares all the
789    limitations of that module, such as the inability to represent dates before
790    year 1 in the proleptic Gregorian calendar. However, since this field
791    stores datetimes as an integer number of microseconds, it could easily
792    represent a much wider range of dates if the Python datetime implementation
793    ever supports them.
794
795    >>> schema = Schema(path=STORED, date=DATETIME)
796    >>> ix = storage.create_index(schema)
797    >>> w = ix.writer()
798    >>> w.add_document(path="/a", date=datetime.now())
799    >>> w.commit()
800    """
801
    def __init__(self, stored=False, unique=False, sortable=False):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        :param unique: Whether the value of this field is unique per-document.
        :param sortable: Whether this field may be sorted on; may also be a
            column object to use for sortable storage.
        """

        # Datetimes are stored as 64-bit signed integers (microsecond counts);
        # shift_step=8 selects coarser tiered-precision steps than the numeric
        # default of 4.
        super(DATETIME, self).__init__(int, 64, stored=stored,
                                       unique=unique, shift_step=8,
                                       sortable=sortable)
812
813    def prepare_datetime(self, x):
814        from whoosh.util.times import floor
815
816        if isinstance(x, text_type):
817            # For indexing, support same strings as for query parsing --
818            # convert unicode to datetime object
819            x = self._parse_datestring(x)
820            x = floor(x)  # this makes most sense (unspecified = lowest)
821
822        if isinstance(x, datetime.datetime):
823            return datetime_to_long(x)
824        elif isinstance(x, bytes_type):
825            return x
826        else:
827            raise Exception("%r is not a datetime" % (x,))
828
829    def to_column_value(self, x):
830        if isinstance(x, bytes_type):
831            raise Exception("%r is not a datetime" % (x,))
832        if isinstance(x, (list, tuple)):
833            x = x[0]
834        return self.prepare_datetime(x)
835
    def from_column_value(self, x):
        # Convert the stored microsecond count back into a datetime object
        return long_to_datetime(x)
838
    def to_bytes(self, x, shift=0):
        # Normalize to an integer microsecond count first, then encode it
        # with the base NUMERIC byte representation
        x = self.prepare_datetime(x)
        return NUMERIC.to_bytes(self, x, shift=shift)
842
    def from_bytes(self, bs):
        # Decode the NUMERIC byte representation, then convert the resulting
        # microsecond count back into a datetime object
        x = NUMERIC.from_bytes(self, bs)
        return long_to_datetime(x)
846
847    def _parse_datestring(self, qstring):
848        # This method parses a very simple datetime representation of the form
849        # YYYY[MM[DD[hh[mm[ss[uuuuuu]]]]]]
850        from whoosh.util.times import adatetime, fix, is_void
851
852        qstring = qstring.replace(" ", "").replace("-", "").replace(".", "")
853        year = month = day = hour = minute = second = microsecond = None
854        if len(qstring) >= 4:
855            year = int(qstring[:4])
856        if len(qstring) >= 6:
857            month = int(qstring[4:6])
858        if len(qstring) >= 8:
859            day = int(qstring[6:8])
860        if len(qstring) >= 10:
861            hour = int(qstring[8:10])
862        if len(qstring) >= 12:
863            minute = int(qstring[10:12])
864        if len(qstring) >= 14:
865            second = int(qstring[12:14])
866        if len(qstring) == 20:
867            microsecond = int(qstring[14:])
868
869        at = fix(adatetime(year, month, day, hour, minute, second,
870                           microsecond))
871        if is_void(at):
872            raise Exception("%r is not a parseable date" % qstring)
873        return at
874
875    def parse_query(self, fieldname, qstring, boost=1.0):
876        from whoosh import query
877        from whoosh.util.times import is_ambiguous
878
879        try:
880            at = self._parse_datestring(qstring)
881        except:
882            e = sys.exc_info()[1]
883            return query.error_query(e)
884
885        if is_ambiguous(at):
886            startnum = datetime_to_long(at.floor())
887            endnum = datetime_to_long(at.ceil())
888            return query.NumericRange(fieldname, startnum, endnum)
889        else:
890            return query.Term(fieldname, at, boost=boost)
891
892    def parse_range(self, fieldname, start, end, startexcl, endexcl,
893                    boost=1.0):
894        from whoosh import query
895
896        if start is None and end is None:
897            return query.Every(fieldname, boost=boost)
898
899        if start is not None:
900            startdt = self._parse_datestring(start).floor()
901            start = datetime_to_long(startdt)
902
903        if end is not None:
904            enddt = self._parse_datestring(end).ceil()
905            end = datetime_to_long(enddt)
906
907        return query.NumericRange(fieldname, start, end, boost=boost)
908
909
class BOOLEAN(FieldType):
    """
    Special field type that lets you index boolean values (True and False).
    The field converts the boolean values to text for you before indexing.

    >>> schema = Schema(path=STORED, done=BOOLEAN)
    >>> ix = storage.create_index(schema)
    >>> w = ix.writer()
    >>> w.add_document(path="/a", done=False)
    >>> w.commit()
    """

    # Indexed term bytes for False and True, indexed by int(bool)
    bytestrings = (b"f", b"t")
    # Textual spellings recognized as True/False in query values
    trues = frozenset(u"t true yes 1".split())
    falses = frozenset(u"f false no 0".split())

    def __init__(self, stored=False, field_boost=1.0):
        """
        :param stored: Whether the value of this field is stored with the
            document.
        """

        self.stored = stored
        # Existence format records only the doc ID, nothing else
        self.format = formats.Existence(field_boost=field_boost)

    def _obj_to_bool(self, x):
        # Recognize the usual textual spellings of true/false; anything else
        # falls back to bool() so arbitrary objects work as query values.
        if isinstance(x, string_type):
            lowered = x.lower()
            if lowered in self.trues:
                return True
            if lowered in self.falses:
                return False
        return bool(x)

    def to_bytes(self, x):
        # Already-encoded values pass straight through
        if isinstance(x, bytes_type):
            return x
        if isinstance(x, string_type):
            value = x.lower() in self.trues
        else:
            value = bool(x)
        return self.bytestrings[int(value)]

    def index(self, bit, **kwargs):
        if isinstance(bit, string_type):
            bit = bit.lower() in self.trues
        else:
            bit = bool(bit)
        # Posting tuple shape: (word, freq, weight, valuestring)
        return [(self.bytestrings[int(bit)], 1, 1.0, emptybytes)]

    def self_parsing(self):
        # This field interprets its own query strings
        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        from whoosh import query

        if qstring == "*":
            return query.Every(fieldname, boost=boost)
        return query.Term(fieldname, self._obj_to_bool(qstring), boost=boost)
977
978
class STORED(FieldType):
    """
    Configured field type for fields you want to store but not index.
    """

    # Values are kept in document storage but never searched
    indexed = False
    stored = True

    def __init__(self):
        # No configuration needed; behavior comes from the class attributes
        pass
989
990
class COLUMN(FieldType):
    """
    Configured field type for fields you want to store as a per-document
    value column but not index.
    """

    # The value is neither searchable nor in document storage; it lives only
    # in the per-document column
    indexed = False
    stored = False

    def __init__(self, columnobj=None):
        """
        :param columnobj: a :class:`whoosh.columns.Column` instance to use
            for storing per-document values. Defaults to a variable-length
            bytes column.
        """
        if columnobj is None:
            columnobj = columns.VarBytesColumn()
        if not isinstance(columnobj, columns.Column):
            raise TypeError("%r is not a column object" % (columnobj,))
        self.column_type = columnobj

    def to_bytes(self, v):
        # Values are stored as-is; the caller supplies encoded bytes
        return v

    def from_bytes(self, b):
        # The raw column value is returned unchanged
        return b
1012
1013
class KEYWORD(FieldType):
    """
    Configured field type for fields containing space-separated or
    comma-separated keyword-like data (such as tags). The default is to not
    store positional information (so phrase searching is not allowed in this
    field) and to not make the field scorable.
    """

    def __init__(self, stored=False, lowercase=False, commas=False,
                 scorable=False, unique=False, field_boost=1.0, sortable=False,
                 vector=None, analyzer=None):
        """
        :param stored: Whether to store the value of the field with the
            document.
        :param lowercase: Whether to lowercase the keywords before indexing.
        :param commas: Whether this is a comma-separated field. If this is
            False (the default), it is treated as a space-separated field.
        :param scorable: Whether this field is scorable.
        :param unique: Whether the value of this field is unique per-document.
        :param sortable: If True, make this field sortable using the default
            column type.
        :param vector: if truthy, store a term vector for this field; an
            instance of :class:`whoosh.formats.Format` is used directly,
            any other true value reuses the field's index format.
        :param analyzer: the analyzer to use; defaults to a
            :class:`whoosh.analysis.KeywordAnalyzer` configured from
            ``lowercase`` and ``commas``.
        """

        if not analyzer:
            analyzer = analysis.KeywordAnalyzer(lowercase=lowercase,
                                                commas=commas)
        self.analyzer = analyzer

        # Record per-document frequencies along with the doc ID so the field
        # can contribute to scoring when requested
        self.format = formats.Frequency(field_boost=field_boost)
        self.scorable = scorable
        self.stored = stored
        self.unique = unique

        if isinstance(vector, formats.Format):
            # Caller supplied an explicit vector format
            self.vector = vector
        else:
            # Any other truthy value means "reuse the index format"
            self.vector = self.format if vector else None

        if sortable:
            self.column_type = self.default_column()
1053
1054
class TEXT(FieldType):
    """
    Configured field type for text fields (for example, the body text of an
    article). The default is to store positional information to allow phrase
    searching. This field type is always scorable.
    """

    def __init__(self, analyzer=None, phrase=True, chars=False, stored=False,
                 field_boost=1.0, multitoken_query="default", spelling=False,
                 sortable=False, lang=None, vector=None,
                 spelling_prefix="spell_"):
        """
        :param analyzer: The analysis.Analyzer to use to index the field
            contents. See the analysis module for more information. If you omit
            this argument, the field uses analysis.StandardAnalyzer.
        :param phrase: Whether to store positional information to allow phrase
            searching.
        :param chars: Whether to store character ranges along with positions.
            If this is True, "phrase" is also implied.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param spelling: if True, and if the field's analyzer changes the form
            of term text (such as a stemming analyzer), this field will store
            extra information in a separate field (named using the
            ``spelling_prefix`` keyword argument) to allow spelling suggestions
            to use the unchanged word forms as spelling suggestions.
        :param sortable: If True, make this field sortable using the default
            column type. If you pass a :class:`whoosh.columns.Column` instance
            instead of True, the field will use the given column type.
        :param lang: automatically configure a
            :class:`whoosh.analysis.LanguageAnalyzer` for the given language.
            This is ignored if you also specify an ``analyzer``.
        :param vector: if this value evaluates to true, store a list of the
            terms in this field in each document. If the value is an instance
            of :class:`whoosh.formats.Format`, the index will use the object to
            store the term vector. Any other true value (e.g. ``vector=True``)
            will use the field's index format to store the term vector as well.
        """

        # An explicit analyzer wins; otherwise a language implies a
        # LanguageAnalyzer, and the fallback is the standard analyzer
        if analyzer:
            self.analyzer = analyzer
        elif lang:
            self.analyzer = analysis.LanguageAnalyzer(lang)
        else:
            self.analyzer = analysis.StandardAnalyzer()

        # chars implies positions; positions allow phrase searching
        formatclass = (formats.Characters if chars
                       else formats.Positions if phrase
                       else formats.Frequency)
        self.format = formatclass(field_boost=field_boost)

        self.column_type = None
        if sortable:
            if isinstance(sortable, columns.Column):
                # Caller supplied an explicit column type
                self.column_type = sortable
            else:
                self.column_type = columns.VarBytesColumn()

        self.spelling = spelling
        self.spelling_prefix = spelling_prefix
        self.multitoken_query = multitoken_query
        self.scorable = True
        self.stored = stored

        if isinstance(vector, formats.Format):
            self.vector = vector
        else:
            # Any other truthy value means "reuse the index format"
            self.vector = self.format if vector else None

    def subfields(self):
        # The field itself always indexes under the bare name
        yield "", self

        # If the user indicated this is a spellable field, and the analyzer
        # is morphic, also index into a spelling-only field that stores
        # minimal information about the unmorphed word forms
        if self.separate_spelling():
            yield self.spelling_prefix, SpellField(self.analyzer)

    def separate_spelling(self):
        # A separate spelling field is only needed when the analyzer
        # actually changes word forms
        return self.spelling and self.analyzer.has_morph()

    def spelling_fieldname(self, fieldname):
        if self.separate_spelling():
            return self.spelling_prefix + fieldname
        return fieldname
1149
1150
class SpellField(FieldType):
    """
    This is a utility field type meant to be returned by ``TEXT.subfields()``
    when it needs a minimal field to store the spellable words.
    """

    def __init__(self, analyzer):
        self.format = formats.Frequency()
        self.analyzer = analyzer
        self.column_type = None
        # Fix: this attribute was previously misspelled "scorabe", leaving a
        # junk attribute on the instance instead of configuring scorability
        self.scorable = False
        self.stored = False
        self.unique = False
        self.indexed = True
        self.spelling = False

    # All the text analysis methods add "nomorph" to the keywords to get
    # unmorphed term texts

    def index(self, value, boost=1.0, **kwargs):
        kwargs["nomorph"] = True
        return FieldType.index(self, value, boost=boost, **kwargs)

    def tokenize(self, value, **kwargs):
        # Fix: this method was previously misspelled "tokenzie", so it never
        # overrode FieldType.tokenize and tokenizing used morphed terms
        kwargs["nomorph"] = True
        return FieldType.tokenize(self, value, **kwargs)

    def process_text(self, qstring, mode='', **kwargs):
        kwargs["nomorph"] = True
        return FieldType.process_text(self, qstring, mode=mode, **kwargs)
1181
1182
class NGRAM(FieldType):
    """
    Configured field that indexes text as N-grams. For example, with a field
    type NGRAM(3,4), the value "hello" will be indexed as tokens
    "hel", "hell", "ell", "ello", "llo". This field type chops the entire text
    into N-grams, including whitespace and punctuation. See :class:`NGRAMWORDS`
    for a field type that breaks the text into words first before chopping the
    words into N-grams.
    """

    scorable = True

    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
                 queryor=False, phrase=False, sortable=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        :param phrase: store positions on the N-grams to allow exact phrase
            searching. The default is off.
        :param sortable: If True, make this field sortable using the default
            column type.
        """

        formatclass = formats.Frequency
        if phrase:
            formatclass = formats.Positions

        # Fix: the analyzer was previously created and assigned twice;
        # create it once
        self.analyzer = analysis.NgramAnalyzer(minsize, maxsize)
        self.format = formatclass(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
        self.set_sortable(sortable)

    def self_parsing(self):
        # This field interprets its own query strings
        return True

    def parse_query(self, fieldname, qstring, boost=1.0):
        """
        Break the query string into N-grams and combine the resulting term
        queries with And (the default) or Or, depending on ``queryor``.
        """
        from whoosh import query

        terms = [query.Term(fieldname, g)
                 for g in self.process_text(qstring, mode='query')]
        cls = query.Or if self.queryor else query.And

        return cls(terms, boost=boost)
1232
1233
class NGRAMWORDS(NGRAM):
    """
    Configured field that chops text into words using a tokenizer,
    lowercases the words, and then chops the words into N-grams.
    """

    scorable = True

    def __init__(self, minsize=2, maxsize=4, stored=False, field_boost=1.0,
                 tokenizer=None, at=None, queryor=False, sortable=False):
        """
        :param minsize: The minimum length of the N-grams.
        :param maxsize: The maximum length of the N-grams.
        :param stored: Whether to store the value of this field with the
            document. Since this field type generally contains a lot of text,
            you should avoid storing it with the document unless you need to,
            for example to allow fast excerpts in the search results.
        :param tokenizer: an instance of :class:`whoosh.analysis.Tokenizer`
            used to break the text into words.
        :param at: if 'start', only takes N-grams from the start of the word.
            If 'end', only takes N-grams from the end. Otherwise the default
            is to take all N-grams from each word.
        :param queryor: if True, combine the N-grams with an Or query. The
            default is to combine N-grams with an And query.
        :param sortable: If True, make this field sortable using the default
            column type.
        """

        # Tokenize into words first, then split each word into N-grams
        self.analyzer = analysis.NgramWordAnalyzer(minsize, maxsize, tokenizer,
                                                   at=at)
        # Frequencies only (no positions), so no phrase searching here
        self.format = formats.Frequency(field_boost=field_boost)
        self.stored = stored
        self.queryor = queryor
        self.set_sortable(sortable)
1266
1267
1268# Other fields
1269
class ReverseField(FieldWrapper):
    # Wraps another field and additionally indexes the reversed form of its
    # terms under a prefixed field name (presumably to support fast
    # suffix/trailing-wildcard matching -- confirm against query code).

    def __init__(self, subfield, prefix="rev_"):
        FieldWrapper.__init__(self, subfield, prefix)
        # Reuse the subfield's analysis, then reverse each token's text
        self.analyzer = subfield.analyzer | analysis.ReverseTextFilter()
        # Minimal postings: no lengths or weights are recorded
        self.format = BasicFormat(lengths=False, weights=False)

        # The reversed terms are an internal lookup aid only: not scorable,
        # not sortable, not stored, not vectored
        self.scorable = False
        self.set_sortable(False)
        self.stored = False
        self.unique = False
        self.vector = False

    def subfields(self):
        # Index the wrapped field under the bare name and this reversed
        # variant under the prefixed name
        yield "", self.subfield
        yield self.name_prefix, self
1285
1286
1287# Schema class
1288
class MetaSchema(type):
    """
    Metaclass for :class:`SchemaClass`. Collects the class body's field
    definitions into a ``_clsfields`` dict (inheriting and overriding fields
    from base schema classes) instead of leaving them as class attributes.
    """

    def __new__(cls, name, bases, attrs):
        super_new = super(MetaSchema, cls).__new__
        if not any(isinstance(base, MetaSchema) for base in bases):
            # Not a subclass of a schema class: build it like a normal class
            return super_new(cls, name, bases, attrs)

        # Dunder attributes stay on the class itself; everything else in the
        # class body is treated as a field definition
        special_attrs = {}
        for key in list(attrs):
            if key.startswith("__"):
                special_attrs[key] = attrs.pop(key)
        new_class = super_new(cls, name, bases, special_attrs)

        # Start from fields inherited from base schema classes, then let this
        # class's own definitions override them
        fields = {}
        for base in bases:
            fields.update(getattr(base, "_clsfields", {}))
        fields.update(attrs)
        new_class._clsfields = fields
        return new_class

    def schema(self):
        # Build a concrete Schema instance from the collected field defs
        return Schema(**self._clsfields)
1313
1314
class Schema(object):
    """
    Represents the collection of fields in an index. Maps field names to
    FieldType objects which define the behavior of each field.

    Low-level parts of the index use field numbers instead of field names for
    compactness. This class has several methods for converting between the
    field name, field number, and field object itself.
    """

    def __init__(self, **fields):
        """
        All keyword arguments to the constructor are treated as fieldname =
        fieldtype pairs. The fieldtype can be an instantiated FieldType object,
        or a FieldType sub-class (in which case the Schema will instantiate it
        with the default constructor before adding it).

        For example::

            s = Schema(content = TEXT,
                       title = TEXT(stored = True),
                       tags = KEYWORD(stored = True))
        """

        # Maps field names to field objects
        self._fields = {}
        # Maps a base field name to the list of indexable (sub)field names
        # it expands to
        self._subfields = {}
        # Maps glob field names to (compiled_expression, fieldtype) pairs
        self._dyn_fields = {}

        # Add in sorted order so behavior is deterministic
        for name in sorted(fields.keys()):
            self.add(name, fields[name])

    def copy(self):
        """
        Returns a shallow copy of the schema. The field instances are not
        deep copied, so they are shared between schema copies.
        """

        return self.__class__(**self._fields)

    def __eq__(self, other):
        # Equal when the other object is exactly a Schema (same class) with
        # the same (name, field) pairs
        return (other.__class__ is self.__class__
                and list(self.items()) == list(other.items()))

    def __ne__(self, other):
        return not self.__eq__(other)

    def __repr__(self):
        return "<%s: %r>" % (self.__class__.__name__, self.names())

    def __iter__(self):
        """
        Returns the field objects in this schema.
        """

        return iter(self._fields.values())

    def __getitem__(self, name):
        """
        Returns the field associated with the given field name.

        :raises KeyError: if no static or dynamic field matches the name.
        """

        # If the name is in the dictionary, just return it
        if name in self._fields:
            return self._fields[name]

        # Check if the name matches a dynamic field
        for expr, fieldtype in itervalues(self._dyn_fields):
            if expr.match(name):
                return fieldtype

        raise KeyError("No field named %r" % (name,))

    def __len__(self):
        """
        Returns the number of fields in this schema.
        """

        return len(self._fields)

    def __contains__(self, fieldname):
        """
        Returns True if a field by the given name is in this schema.
        """

        # Defined in terms of __getitem__ so that there's only one method to
        # override to provide dynamic fields
        try:
            field = self[fieldname]
            return field is not None
        except KeyError:
            return False

    def __setstate__(self, state):
        # Back-compat when unpickling schemas saved before _subfields existed
        if "_subfields" not in state:
            state["_subfields"] = {}
        self.__dict__.update(state)

    def to_bytes(self, fieldname, value):
        # Delegate encoding to the named field's own to_bytes()
        return self[fieldname].to_bytes(value)

    def items(self):
        """
        Returns a list of ("fieldname", field_object) pairs for the fields
        in this schema.
        """

        return sorted(self._fields.items())

    def names(self, check_names=None):
        """
        Returns a list of the names of the fields in this schema.

        :param check_names: (optional) sequence of field names to check
            whether the schema accepts them as (dynamic) field names -
            acceptable names will also be in the result list.
            Note: You may also have static field names in check_names, that
            won't create duplicates in the result list. Unsupported names
            will not be in the result list.
        """

        fieldnames = set(self._fields.keys())
        if check_names is not None:
            check_names = set(check_names) - fieldnames
            fieldnames.update(fieldname for fieldname in check_names
                              if fieldname in self)
        return sorted(fieldnames)

    def clean(self):
        # Let every field release any cached resources
        for field in self:
            field.clean()

    def add(self, name, fieldtype, glob=False):
        """
        Adds a field to this schema.

        :param name: The name of the field.
        :param fieldtype: An instantiated fields.FieldType object, or a
            FieldType subclass. If you pass an instantiated object, the schema
            will use that as the field configuration for this field. If you
            pass a FieldType subclass, the schema will automatically
            instantiate it with the default constructor.
        :param glob: if True, the name is a glob pattern and matching field
            names will dynamically use this field type.
        :raises FieldConfigurationError: for invalid names, duplicate names,
            or non-FieldType objects.
        """

        # If the user passed a type rather than an instantiated field object,
        # instantiate it automatically
        if type(fieldtype) is type:
            try:
                fieldtype = fieldtype()
            except Exception as e:
                # Fix: previously a bare "except:", which would also swallow
                # system-exiting exceptions such as KeyboardInterrupt
                raise FieldConfigurationError("Error: %s instantiating field "
                                              "%r: %r" % (e, name, fieldtype))

        if not isinstance(fieldtype, FieldType):
            raise FieldConfigurationError("%r is not a FieldType object"
                                          % fieldtype)

        # A field may expand into several indexable subfields (e.g. a
        # spelling companion field); track them under the base name
        self._subfields[name] = sublist = []
        for prefix, subfield in fieldtype.subfields():
            fname = prefix + name
            sublist.append(fname)

            # Check field name
            if fname.startswith("_"):
                raise FieldConfigurationError("Names cannot start with _")
            elif " " in fname:
                raise FieldConfigurationError("Names cannot contain spaces")
            elif fname in self._fields or (glob and fname in self._dyn_fields):
                raise FieldConfigurationError("%r already in schema" % fname)

            # Add the field
            if glob:
                expr = re.compile(fnmatch.translate(name))
                self._dyn_fields[fname] = (expr, subfield)
            else:
                fieldtype.on_add(self, fname)
                self._fields[fname] = subfield

    def remove(self, fieldname):
        """
        Removes the named static or dynamic field (and any subfields of a
        static field) from the schema.

        :raises KeyError: if no such field exists.
        """

        if fieldname in self._fields:
            self._fields[fieldname].on_remove(self, fieldname)
            del self._fields[fieldname]

            # Remove any subfields registered under the base name
            if fieldname in self._subfields:
                for subname in self._subfields[fieldname]:
                    if subname in self._fields:
                        del self._fields[subname]
                del self._subfields[fieldname]

        elif fieldname in self._dyn_fields:
            del self._dyn_fields[fieldname]

        else:
            raise KeyError("No field named %r" % fieldname)

    def indexable_fields(self, fieldname):
        """
        Yields (fieldname, fieldtype) pairs for every indexable (sub)field
        the given base field name expands to.
        """

        if fieldname in self._subfields:
            for subname in self._subfields[fieldname]:
                yield subname, self._fields[subname]
        else:
            # Use __getitem__ here instead of getting it directly from _fields
            # because it might be a glob
            yield fieldname, self[fieldname]

    def has_scorable_fields(self):
        # True if any field in the schema stores scoring information
        return any(ftype.scorable for ftype in self)

    def stored_names(self):
        """
        Returns a list of the names of fields that are stored.
        """

        return [name for name, field in self.items() if field.stored]

    def scorable_names(self):
        """
        Returns a list of the names of fields that store field
        lengths.
        """

        return [name for name, field in self.items() if field.scorable]
1536
1537
class SchemaClass(with_metaclass(MetaSchema, Schema)):
    """
    Allows you to define a schema using declarative syntax, similar to
    Django models::

        class MySchema(SchemaClass):
            path = ID
            date = DATETIME
            content = TEXT

    You can use inheritance to share common fields between schemas::

        class Parent(SchemaClass):
            path = ID(stored=True)
            date = DATETIME

        class Child1(Parent):
            content = TEXT(positions=False)

        class Child2(Parent):
            tags = KEYWORD

    This class overrides ``__new__`` so instantiating your sub-class always
    results in an instance of ``Schema``.

    >>> class MySchema(SchemaClass):
    ...     title = TEXT(stored=True)
    ...     content = TEXT
    ...
    >>> s = MySchema()
    >>> type(s)
    <class 'whoosh.fields.Schema'>

    """

    def __new__(cls, *args, **kwargs):
        # Always produce a plain Schema instance, never an instance of the
        # declarative subclass itself
        obj = super(Schema, cls).__new__(Schema)
        # Fix: copy the class-level field dict before updating it. The
        # previous code updated cls._clsfields in place, permanently adding
        # per-call keyword fields to the class and leaking them into every
        # later instantiation.
        kw = dict(getattr(cls, "_clsfields", {}))
        kw.update(kwargs)
        obj.__init__(*args, **kw)
        return obj
1579
1580
def ensure_schema(schema):
    """
    Accept either a Schema instance or a Schema subclass (such as a
    SchemaClass) and return a Schema instance, raising
    FieldConfigurationError for anything else.
    """

    if isinstance(schema, type) and issubclass(schema, Schema):
        # Instantiate a declarative schema class
        schema = schema.schema()
    if isinstance(schema, Schema):
        return schema
    raise FieldConfigurationError("%r is not a Schema" % schema)
1587
1588
def merge_fielddict(d1, d2):
    """
    Merge two fieldname -> field dictionaries into a new dict, raising an
    exception when both define the same name with unequal (truthy) field
    objects.
    """

    out = {}
    for name in set(d1) | set(d2):
        field1 = d1.get(name)
        field2 = d2.get(name)
        if field1 and field2 and field1 != field2:
            raise Exception("Inconsistent field %r: %r != %r"
                            % (name, field1, field2))
        out[name] = field1 or field2
    return out
1600
1601
def merge_schema(s1, s2):
    # Combine the static and dynamic fields of two schemas into a new Schema.
    # Raises (via merge_fielddict) if the schemas define the same field name
    # inconsistently.
    # NOTE(review): _subfields is not merged here, so the merged schema loses
    # the subfield groupings of its inputs -- confirm whether callers use
    # indexable_fields() on a merged schema.
    schema = Schema()
    schema._fields = merge_fielddict(s1._fields, s2._fields)
    schema._dyn_fields = merge_fielddict(s1._dyn_fields, s2._dyn_fields)
    return schema
1607
1608
def merge_schemas(schemas):
    """
    Fold a sequence of schemas into a single merged schema, left to right.
    """

    merged = schemas[0]
    for other in schemas[1:]:
        merged = merge_schema(merged, other)
    return merged
1614