1from __future__ import with_statement
2
3from whoosh import analysis, fields
4from whoosh.compat import xrange, u
5from whoosh.codec import default_codec
6from whoosh.formats import Existence, Frequency
7from whoosh.formats import Positions, PositionBoosts
8from whoosh.formats import Characters, CharacterBoosts
9from whoosh.util.testing import TempStorage
10
11
def _roundtrip(content, format_, astype, ana=None):
    """Write ``content`` to a temporary segment and read its postings back.

    The text is indexed through a ``FieldType`` built from ``format_`` and
    ``ana``, written with the default codec's field writer under the field
    name ``"f1"`` as document 0, and then read back through the codec's
    terms reader.

    :param content: the text to index.
    :param format_: the posting format object used both for the field and
        when opening a matcher for each term.
    :param astype: the name handed to the matcher's ``value_as()`` to select
        how each posting value is decoded (e.g. ``"frequency"``).
    :param ana: the analyzer for the field; a ``StandardAnalyzer`` is used
        when this is falsy.
    :returns: a list of ``(term, decoded_value)`` pairs, in the order the
        terms reader yields the terms.
    """
    with TempStorage("roundtrip") as st:
        codec = default_codec()
        seg = codec.new_segment(st, "")
        ana = ana or analysis.StandardAnalyzer()
        field = fields.FieldType(format=format_, analyzer=ana)

        # Write one posting per term, all attributed to document 0.
        # NOTE(review): the terms are sorted before writing, presumably
        # because the field writer expects them in order -- confirm.
        fw = codec.field_writer(st, seg)
        fw.start_field("f1", field)
        for text, _, weight, valuestring in sorted(field.index(content)):
            fw.start_term(text)
            fw.add(0, weight, valuestring, None)
            fw.finish_term()
        fw.finish_field()
        fw.close()

        # Close the reader even when decoding fails, so no reader is left
        # open while the temporary storage is being torn down.
        tr = codec.terms_reader(st, seg)
        try:
            return [(field.from_bytes(btext),
                     tr.matcher(fieldname, btext, format_).value_as(astype))
                    for fieldname, btext in tr.terms()]
        finally:
            tr.close()
35
36
def test_existence_postings():
    """Existence postings read back as a frequency of 1 for every term."""
    expected = [("alfa", 1), ("bravo", 1), ("charlie", 1)]
    result = _roundtrip(u("alfa bravo charlie"), Existence(), "frequency")
    assert result == expected
40
41
def test_frequency_postings():
    """Frequency postings read back as the per-term occurrence count."""
    text = u("alfa bravo charlie bravo alfa alfa")
    expected = [("alfa", 3), ("bravo", 2), ("charlie", 1)]
    result = _roundtrip(text, Frequency(), "frequency")
    assert result == expected
45
46
def test_position_postings():
    """Positions postings decode as position lists and as frequencies."""
    text = u("alfa bravo charlie bravo alfa alfa")

    # Each round trip gets its own freshly constructed format object.
    as_positions = _roundtrip(text, Positions(), "positions")
    assert as_positions == [
        ("alfa", [0, 4, 5]),
        ("bravo", [1, 3]),
        ("charlie", [2]),
    ]

    as_frequency = _roundtrip(text, Positions(), "frequency")
    assert as_frequency == [("alfa", 3), ("bravo", 2), ("charlie", 1)]
51
52
def test_character_postings():
    """Characters postings decode as characters, positions and frequency."""
    text = u("alfa bravo charlie bravo alfa alfa")

    # Each entry is (position, start character, end character).
    as_characters = _roundtrip(text, Characters(), "characters")
    assert as_characters == [
        ("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]),
        ("bravo", [(1, 5, 10), (3, 19, 24)]),
        ("charlie", [(2, 11, 18)]),
    ]

    as_positions = _roundtrip(text, Characters(), "positions")
    assert as_positions == [
        ("alfa", [0, 4, 5]),
        ("bravo", [1, 3]),
        ("charlie", [2]),
    ]

    as_frequency = _roundtrip(text, Characters(), "frequency")
    assert as_frequency == [("alfa", 3), ("bravo", 2), ("charlie", 1)]
60
61
def test_posboost_postings():
    """PositionBoosts postings decode as boosts, positions and frequency."""
    fmt = PositionBoosts()
    tokenizer = analysis.RegexTokenizer(r"\S+")
    analyzer = tokenizer | analysis.DelimitedAttributeFilter()
    text = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    # (astype, expected round-trip result), checked in this order with the
    # same format instance.
    cases = (
        ("position_boosts", [("alfa", [(0, 2), (4, 1), (5, 1)]),
                             ("bravo", [(1, 0.1), (3, 0.5)]),
                             ("charlie", [(2, 2)])]),
        ("positions", [("alfa", [0, 4, 5]),
                       ("bravo", [1, 3]),
                       ("charlie", [2])]),
        ("frequency", [("alfa", 3), ("bravo", 2), ("charlie", 1)]),
    )
    for astype, expected in cases:
        assert _roundtrip(text, fmt, astype, analyzer) == expected
71
72
def test_charboost_postings():
    """CharacterBoosts postings decode as every less detailed value type."""
    fmt = CharacterBoosts()
    tokenizer = analysis.RegexTokenizer(r"\S+")
    analyzer = tokenizer | analysis.DelimitedAttributeFilter()
    text = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")

    # (astype, expected round-trip result), checked in this order with the
    # same format instance.
    cases = (
        ("character_boosts", [
            ("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]),
            ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]),
            ("charlie", [(2, 17, 24, 2)]),
        ]),
        ("position_boosts", [
            ("alfa", [(0, 2), (4, 1), (5, 1)]),
            ("bravo", [(1, 0.1), (3, 0.5)]),
            ("charlie", [(2, 2)]),
        ]),
        ("characters", [
            ("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]),
            ("bravo", [(1, 7, 12), (3, 27, 32)]),
            ("charlie", [(2, 17, 24)]),
        ]),
        ("positions", [
            ("alfa", [0, 4, 5]),
            ("bravo", [1, 3]),
            ("charlie", [2]),
        ]),
        ("frequency", [("alfa", 3), ("bravo", 2), ("charlie", 1)]),
    )
    for astype, expected in cases:
        assert _roundtrip(text, fmt, astype, analyzer) == expected
88