"""Round-trip tests for posting formats.

Each test indexes a short string with one posting format, writes the
resulting postings through the default codec into temporary storage, reads
every term back, and compares the decoded posting values with literals.
"""
from __future__ import with_statement

from whoosh import analysis, fields
from whoosh.compat import xrange, u
from whoosh.codec import default_codec
from whoosh.formats import Existence, Frequency
from whoosh.formats import Positions, PositionBoosts
from whoosh.formats import Characters, CharacterBoosts
from whoosh.util.testing import TempStorage


# Expected decodings shared by the tests whose content holds the tokens
# alfa, bravo, charlie, bravo, alfa, alfa in that order (with or without
# "^boost" suffixes).
_FREQUENCIES = [("alfa", 3), ("bravo", 2), ("charlie", 1)]
_POSITIONS = [("alfa", [0, 4, 5]), ("bravo", [1, 3]), ("charlie", [2])]

# Expected (position, boost) pairs for the boosted content used by the
# PositionBoosts and CharacterBoosts tests.
_POSITION_BOOSTS = [("alfa", [(0, 2), (4, 1), (5, 1)]),
                    ("bravo", [(1, 0.1), (3, 0.5)]),
                    ("charlie", [(2, 2)])]


def _boost_analyzer():
    """Return the whitespace tokenizer piped into DelimitedAttributeFilter."""
    return analysis.RegexTokenizer(r"\S+") | analysis.DelimitedAttributeFilter()


def _roundtrip(content, format_, astype, ana=None):
    """Write ``content`` as postings and read every term back.

    The content is indexed with a ``FieldType`` built from ``format_`` and
    the analyzer, written as field ``"f1"`` through the default codec into a
    ``TempStorage``, and then read back term by term.

    :param content: the text to index.
    :param format_: the posting format instance used both for the field and
        for the matcher that reads the postings back.
    :param astype: the name passed to the matcher's ``value_as()``.
    :param ana: optional analyzer; ``analysis.StandardAnalyzer()`` is used
        when this is falsy.
    :returns: a list of ``(term, value)`` tuples in the order the terms
        reader yields them, where ``term`` is ``field.from_bytes()`` of the
        stored term bytes.
    """

    with TempStorage("roundtrip") as st:
        codec = default_codec()
        seg = codec.new_segment(st, "")
        ana = ana or analysis.StandardAnalyzer()
        field = fields.FieldType(format=format_, analyzer=ana)

        fw = codec.field_writer(st, seg)
        # Close the writer even when writing fails, so no handle is still
        # open while the temporary storage is being cleaned up.
        try:
            fw.start_field("f1", field)
            # Terms are written in sorted order.
            for text, _, weight, valuestring in sorted(field.index(content)):
                fw.start_term(text)
                # Every posting is added under 0 with no fourth argument
                # (presumably document number and length -- TODO confirm).
                fw.add(0, weight, valuestring, None)
                fw.finish_term()
            fw.finish_field()
        finally:
            fw.close()

        tr = codec.terms_reader(st, seg)
        # Same guarantee for the reader: always release it.
        try:
            ps = []
            for fieldname, btext in tr.terms():
                m = tr.matcher(fieldname, btext, format_)
                ps.append((field.from_bytes(btext), m.value_as(astype)))
        finally:
            tr.close()
        return ps


def test_existence_postings():
    content = u("alfa bravo charlie")
    expected = [("alfa", 1), ("bravo", 1), ("charlie", 1)]
    assert _roundtrip(content, Existence(), "frequency") == expected


def test_frequency_postings():
    content = u("alfa bravo charlie bravo alfa alfa")
    assert _roundtrip(content, Frequency(), "frequency") == _FREQUENCIES


def test_position_postings():
    content = u("alfa bravo charlie bravo alfa alfa")
    assert _roundtrip(content, Positions(), "positions") == _POSITIONS
    assert _roundtrip(content, Positions(), "frequency") == _FREQUENCIES


def test_character_postings():
    content = u("alfa bravo charlie bravo alfa alfa")
    # Each entry is a 3-tuple whose first item matches the position list.
    expected_chars = [("alfa", [(0, 0, 4), (4, 25, 29), (5, 30, 34)]),
                      ("bravo", [(1, 5, 10), (3, 19, 24)]),
                      ("charlie", [(2, 11, 18)])]
    assert _roundtrip(content, Characters(), "characters") == expected_chars
    assert _roundtrip(content, Characters(), "positions") == _POSITIONS
    assert _roundtrip(content, Characters(), "frequency") == _FREQUENCIES


def test_posboost_postings():
    pbs = PositionBoosts()
    ana = _boost_analyzer()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")
    assert _roundtrip(content, pbs, "position_boosts", ana) == _POSITION_BOOSTS
    assert _roundtrip(content, pbs, "positions", ana) == _POSITIONS
    assert _roundtrip(content, pbs, "frequency", ana) == _FREQUENCIES


def test_charboost_postings():
    cbs = CharacterBoosts()
    ana = _boost_analyzer()
    content = u("alfa^2 bravo^0.1 charlie^2 bravo^0.5 alfa alfa")
    # Each entry is a 4-tuple: the 3-tuple from "characters" plus the boost.
    expected_char_boosts = [
        ("alfa", [(0, 0, 4, 2), (4, 37, 41, 1), (5, 42, 46, 1)]),
        ("bravo", [(1, 7, 12, 0.1), (3, 27, 32, 0.5)]),
        ("charlie", [(2, 17, 24, 2)]),
    ]
    expected_chars = [("alfa", [(0, 0, 4), (4, 37, 41), (5, 42, 46)]),
                      ("bravo", [(1, 7, 12), (3, 27, 32)]),
                      ("charlie", [(2, 17, 24)])]
    assert _roundtrip(content, cbs, "character_boosts", ana) == expected_char_boosts
    assert _roundtrip(content, cbs, "position_boosts", ana) == _POSITION_BOOSTS
    assert _roundtrip(content, cbs, "characters", ana) == expected_chars
    assert _roundtrip(content, cbs, "positions", ana) == _POSITIONS
    assert _roundtrip(content, cbs, "frequency", ana) == _FREQUENCIES