from __future__ import with_statement
import random, threading, time

from whoosh import analysis, fields, formats, reading
from whoosh.compat import b, u, xrange
from whoosh.reading import SegmentReader
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex


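# Helpers: build a small three-field KEYWORD schema in a RamStorage index,
# either committed as a single segment or split across three segments by
# committing each batch with merge=False.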
def _create_index():
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    return ix


def _one_segment_index():
    ix = _create_index()
    w = ix.writer()
    w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z"))
    w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S"))
    w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S"))
    w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z"))
    w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y"))
    w.commit()

    return ix


def _multi_segment_index():
    ix = _create_index()
    w = ix.writer()
    w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z"))
    w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S"))
    w.commit()

    w = ix.writer()
    w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S"))
    w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y"))
    w.commit(merge=False)

    return ix


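# Helpers: flatten a reader's term iterators into comparable tuples of
# (fieldname, termbytes, doc_frequency, weight), or (termbytes,
# doc_frequency, weight) for a single field.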
def _stats(r):
    return [(fname, text, ti.doc_frequency(), ti.weight())
            for (fname, text), ti in r]


def _fstats(r):
    return [(text, ti.doc_frequency(), ti.weight())
            for text, ti in r]


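# The same stored fields and term statistics should come back whether the
# index is read through a single SegmentReader or a MultiReader over three
# segments.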
def test_readers():
    target = [("f1", b('A'), 4, 6), ("f1", b('B'), 2, 2), ("f1", b('C'), 2, 2),
              ("f1", b('D'), 1, 1), ("f1", b('E'), 2, 2), ("f1", b('F'), 1, 1),
              ("f2", b('1'), 3, 3), ("f2", b('2'), 3, 3), ("f2", b('3'), 2, 2),
              ("f2", b('4'), 2, 2), ("f2", b('5'), 2, 2), ("f2", b('6'), 2, 2),
              ("f3", b('Q'), 2, 2), ("f3", b('R'), 2, 2), ("f3", b('S'), 2, 2),
              ("f3", b('X'), 3, 3), ("f3", b('Y'), 3, 3), ("f3", b('Z'), 2, 2)]
    target = sorted(target)

    stored = [{"f1": "A B C"}, {"f1": "D E F"}, {"f1": "A E C"},
              {"f1": "A A A"}, {"f1": "A B"}]

    def t(ix):
        r = ix.reader()
        assert list(r.all_stored_fields()) == stored
        assert sorted(_stats(r)) == target

    ix = _one_segment_index()
    assert len(ix._segments()) == 1
    t(ix)

    ix = _multi_segment_index()
    assert len(ix._segments()) == 3
    t(ix)


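# Exercise the term-inspection API on a single-segment index: field_terms(),
# expand_prefix(), all_terms(), iter_field(), most_frequent_terms() and
# most_distinctive_terms() (the last scores terms with a tf*idf-style weight,
# e.g. 2 * ln(2) ~= 1.386 for "ax", which occurs twice but only in one of the
# two documents).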
def test_term_inspection():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u("My document"),
                content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE")
            )
            w.add_document(
                title=u("My other document"),
                content=u("AA AB BB CC EE EE AX AX DD")
            )

        with ix.reader() as r:
            cterms = " ".join(r.field_terms("content"))
            assert cterms == "aa ab ax bb cc dd ee"

            a_exp = list(r.expand_prefix("content", "a"))
            assert a_exp == [b('aa'), b('ab'), b('ax')]

            assert set(r.all_terms()) == set([
                ('content', b('aa')), ('content', b('ab')),
                ('content', b('ax')), ('content', b('bb')),
                ('content', b('cc')), ('content', b('dd')),
                ('content', b('ee')), ('title', b('document')),
                ('title', b('my')), ('title', b('other'))
            ])

            # (text, doc_freq, index_freq)
            cstats = _fstats(r.iter_field("content"))
            assert cstats == [
                (b('aa'), 2, 6), (b('ab'), 1, 1), (b('ax'), 1, 2),
                (b('bb'), 2, 5), (b('cc'), 2, 3), (b('dd'), 2, 2),
                (b('ee'), 2, 4)
            ]

            prestats = _fstats(r.iter_field("content", prefix="c"))
            assert prestats == [
                (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)
            ]

            assert list(r.most_frequent_terms("content")) == [
                (6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')),
                (2, b('dd'))
            ]
            assert list(r.most_frequent_terms("content", prefix="a")) == [
                (6, b('aa')), (2, b('ax')), (1, b('ab'))
            ]
            assert list(r.most_distinctive_terms("content", 3)) == [
                (1.3862943611198906, b('ax')), (0.6931471805599453, b('ab')),
                (0.0, b('ee'))
            ]


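# A field indexed with vector=formats.Positions() stores a per-document term
# vector; vector_as("weight", docnum, fieldname) reads it back as
# (term, weight) pairs in term order. Note that "the" does not appear in the
# expected result because the default TEXT analyzer filters out stop words.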
def test_vector_postings():
    s = fields.Schema(id=fields.ID(stored=True, unique=True),
                      content=fields.TEXT(vector=formats.Positions()))
    st = RamStorage()
    ix = st.create_index(s)

    writer = ix.writer()
    writer.add_document(id=u('1'),
                        content=u('the quick brown fox jumped over the ' +
                                  'lazy dogs'))
    writer.commit()
    r = ix.reader()

    terms = list(r.vector_as("weight", 0, "content"))
    assert terms == [(u('brown'), 1.0), (u('dogs'), 1.0), (u('fox'), 1.0),
                     (u('jumped'), 1.0), (u('lazy'), 1.0),
                     (u('over'), 1.0), (u('quick'), 1.0)]


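# stored_fields(docnum) and document(**kw) return only the stored fields
# (a, b and d here); the unstored KEYWORD field c is searchable but is not
# part of the stored document.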
def test_stored_fields():
    s = fields.Schema(a=fields.ID(stored=True), b=fields.STORED,
                      c=fields.KEYWORD, d=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(s)

    writer = ix.writer()
    writer.add_document(a=u("1"), b="a", c=u("zulu"), d=u("Alfa"))
    writer.add_document(a=u("2"), b="b", c=u("yankee"), d=u("Bravo"))
    writer.add_document(a=u("3"), b="c", c=u("xray"), d=u("Charlie"))
    writer.commit()

    with ix.searcher() as sr:
        assert sr.stored_fields(0) == {"a": u("1"), "b": "a", "d": u("Alfa")}
        assert sr.stored_fields(2) == {"a": u("3"), "b": "c", "d": u("Charlie")}

        assert sr.document(a=u("1")) == {"a": u("1"), "b": "a", "d": u("Alfa")}
        assert sr.document(a=u("2")) == {"a": u("2"), "b": "b", "d": u("Bravo")}


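# Schema.stored_names() lists the stored field names in sorted order, and a
# document looked up by path comes back with all of its stored values.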
def test_stored_fields2():
    schema = fields.Schema(content=fields.TEXT(stored=True),
                           title=fields.TEXT(stored=True),
                           summary=fields.STORED,
                           path=fields.ID(stored=True))

    storedkeys = ["content", "path", "summary", "title"]
    assert storedkeys == schema.stored_names()

    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(content=u("Content of this document."),
                        title=u("This is the title"),
                        summary=u("This is the summary"), path=u("/main"))
    writer.add_document(content=u("Second document."), title=u("Second title"),
                        summary=u("Summary numero due"), path=u("/second"))
    writer.add_document(content=u("Third document."), title=u("Title 3"),
                        summary=u("Summary treo"), path=u("/san"))
    writer.commit()

    with ix.searcher() as s:
        doc = s.document(path="/main")
        assert doc is not None
        assert ([doc[k] for k in sorted(doc.keys())]
                == ["Content of this document.", "/main",
                    "This is the summary", "This is the title"])

    ix.close()


def test_all_stored_fields():
    # all_stored_fields() should only yield the stored fields of documents
    # that have not been deleted

    schema = fields.Schema(a=fields.ID(stored=True), b=fields.STORED)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(a=u("alfa"), b=u("bravo"))
        w.add_document(a=u("apple"), b=u("bear"))
        w.add_document(a=u("alpaca"), b=u("beagle"))
        w.add_document(a=u("aim"), b=u("box"))

    w = ix.writer()
    w.delete_by_term("a", "apple")
    w.delete_by_term("a", "aim")
    w.commit(merge=False)

    with ix.searcher() as s:
        assert s.doc_count_all() == 4
        assert s.doc_count() == 2
        sfs = list((sf["a"], sf["b"]) for sf in s.all_stored_fields())
        assert sfs == [("alfa", "bravo"), ("alpaca", "beagle")]


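# first_id(fieldname, text) returns the lowest document number containing the
# given term, both for a single-segment reader and for a MultiReader over
# three segments.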
def test_first_id():
    schema = fields.Schema(path=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(path=u("/a"))
    w.add_document(path=u("/b"))
    w.add_document(path=u("/c"))
    w.commit()

    r = ix.reader()
    docid = r.first_id("path", u("/b"))
    assert r.stored_fields(docid) == {"path": "/b"}

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(path=u("/a"))
    w.add_document(path=u("/b"))
    w.add_document(path=u("/c"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(path=u("/d"))
    w.add_document(path=u("/e"))
    w.add_document(path=u("/f"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(path=u("/g"))
    w.add_document(path=u("/h"))
    w.add_document(path=u("/i"))
    w.commit(merge=False)

    r = ix.reader()
    assert r.__class__ == reading.MultiReader
    docid = r.first_id("path", u("/e"))
    assert r.stored_fields(docid) == {"path": "/e"}


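# Concurrency smoke test: one thread repeatedly commits new documents while
# another repeatedly opens and closes readers. There are no assertions; a
# regression would show up as an unhandled exception in one of the threads or
# as a hang in join().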
class RecoverReader(threading.Thread):
    def __init__(self, ix):
        threading.Thread.__init__(self)
        self.ix = ix

    def run(self):
        for _ in xrange(50):
            r = self.ix.reader()
            r.close()


class RecoverWriter(threading.Thread):
    domain = u("alfa bravo charlie delta echo foxtrot golf hotel india")
    domain = domain.split()

    def __init__(self, ix):
        threading.Thread.__init__(self)
        self.ix = ix

    def run(self):
        for _ in xrange(10):
            w = self.ix.writer()
            w.add_document(text=random.sample(self.domain, 4))
            w.commit()
            time.sleep(0.01)


def test_delete_recovery():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "delrecover") as ix:
        rw = RecoverWriter(ix)
        rr = RecoverReader(ix)
        rw.start()
        rr.start()
        rw.join()
        rr.join()


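# Multiple reader threads should be able to read the same multi-segment index
# concurrently; opening a reader must not take an exclusive lock.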
def test_nonexclusive_read():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "readlock") as ix:
        for num in u("one two three four five").split():
            w = ix.writer()
            w.add_document(text=u("Test document %s") % num)
            w.commit(merge=False)

        def fn():
            for _ in xrange(5):
                r = ix.reader()
                assert list(r.field_terms("text")) == [
                    "document", "five", "four", "one", "test", "three", "two"
                ]
                r.close()

        ths = [threading.Thread(target=fn) for _ in xrange(5)]
        for th in ths:
            th.start()
        for th in ths:
            th.join()


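# doc_count() counts only undeleted documents while doc_count_all() also
# includes deleted ones; optimize() merges the segments and drops the deleted
# documents, after which the two counts agree.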
def test_doc_count():
    schema = fields.Schema(id=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(id=i)

    r = ix.reader()
    assert r.doc_count() == 10
    assert r.doc_count_all() == 10

    w = ix.writer()
    w.delete_document(2)
    w.delete_document(4)
    w.delete_document(6)
    w.delete_document(8)
    w.commit()

    r = ix.reader()
    assert r.doc_count() == 6
    assert r.doc_count_all() == 10

    w = ix.writer()
    for i in xrange(10, 15):
        w.add_document(id=i)
    w.commit(merge=False)

    r = ix.reader()
    assert r.doc_count() == 11
    assert r.doc_count_all() == 15

    w = ix.writer()
    w.delete_document(10)
    w.delete_document(12)
    w.delete_document(14)
    w.commit(merge=False)

    r = ix.reader()
    assert r.doc_count() == 8
    assert r.doc_count_all() == 15

    ix.optimize()
    r = ix.reader()
    assert r.doc_count() == 8
    assert r.doc_count_all() == 8


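# Sanity check that SegmentReader, MultiReader and EmptyReader each implement
# the abstract interface defined by reading.IndexReader.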
def test_reader_subclasses():
    from whoosh.util.testing import check_abstract_methods

    check_abstract_methods(reading.IndexReader, SegmentReader)
    check_abstract_methods(reading.IndexReader, reading.MultiReader)
    check_abstract_methods(reading.IndexReader, reading.EmptyReader)


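# A term cursor walks a field's terms in sorted order: text() reports the
# current term, next() advances, first() rewinds, and find(prefix) seeks to
# the first term >= the given bytes, returning None and invalidating the
# cursor when it runs past the last term.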
def test_cursor():
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"papa quebec romeo sierra tango")
            w.add_document(text=u"foxtrot golf hotel india juliet")
            w.add_document(text=u"alfa bravo charlie delta echo")
            w.add_document(text=u"uniform victor whiskey x-ray")
            w.add_document(text=u"kilo lima mike november oskar")
            w.add_document(text=u"charlie alfa alfa bravo bravo bravo")

        with ix.reader() as r:
            cur = r.cursor("text")
            assert cur.text() == "alfa"
            assert cur.next() == "bravo"
            assert cur.text() == "bravo"

            assert cur.find(b"inc") == "india"
            assert cur.text() == "india"

            assert cur.first() == "alfa"
            assert cur.text() == "alfa"

            assert cur.find(b"zulu") is None
            assert cur.text() is None
            assert not cur.is_valid()

            assert cur.find(b"a") == "alfa"
            assert cur.term_info().weight() == 3
            assert cur.next() == "bravo"
            assert cur.term_info().weight() == 4
            assert cur.next() == "charlie"
            assert cur.term_info().weight() == 2


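# Shared assertions for the two term-inspection tests below: terms are stored
# as UTF-8 encoded bytes (hence the encoded forms of "aé" and "aú"), and the
# same statistics should be reported whether the index is read through a
# SegmentReader (single segment) or a MultiReader (two segments).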
def _check_inspection_results(ix):
    AE = u'aé'.encode('utf-8')
    AU = u'aú'.encode('utf-8')

    with ix.reader() as r:
        cterms = " ".join(r.field_terms("content"))
        assert cterms == u"aa aé aú bb cc dd ee"

        a_exp = list(r.expand_prefix("content", "a"))
        assert a_exp == [b('aa'), AE, AU]

        tset = set(r.all_terms())
        assert tset == set([
            ('content', b('aa')), ('content', AE),
            ('content', AU), ('content', b('bb')),
            ('content', b('cc')), ('content', b('dd')),
            ('content', b('ee')), ('title', b('document')),
            ('title', b('my')), ('title', b('other'))
        ])

        # (text, doc_freq, index_freq)
        assert _fstats(r.iter_field("content")) == [
            (b('aa'), 2, 6), (AE, 1, 1), (AU, 1, 2), (b('bb'), 2, 5),
            (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)
        ]
        assert _fstats(r.iter_field("content", prefix="c")) == [
            (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)
        ]

        assert list(r.most_frequent_terms("content")) == [
            (6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')),
            (2, b('dd'))
        ]
        assert list(r.most_frequent_terms("content", prefix="a")) == [
            (6, b('aa')), (2, AU), (1, AE)
        ]
        assert list(r.most_distinctive_terms("content", 3)) == [
            (1.3862943611198906, AU), (0.6931471805599453, AE), (0.0, b('ee'))
        ]


def test_term_inspection_segment_reader():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u"My document",
                content=u"AA AA BB BB CC AA AA AA BB BB CC DD EE EE"
            )
            w.add_document(
                title=u"My other document",
                content=u"AA AÉ BB CC EE EE Aú AÚ DD"
            )

        _check_inspection_results(ix)


def test_term_inspection_multi_reader():
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u"My document",
                content=u"AA AA BB BB CC AA AA AA BB BB CC DD EE EE"
            )

        with ix.writer() as w:
            w.add_document(
                title=u"My other document",
                content=u"AA AÉ BB CC EE EE Aú AÚ DD"
            )
            w.merge = False

        _check_inspection_results(ix)
