# -*- coding: utf-8 -*-
"""Tests for index readers: term statistics, stored fields, document
counts, term cursors, and reading while other threads read or write.
"""
# The coding declaration above is required on Python 2 because this module
# contains non-ASCII string literals; it is a no-op on Python 3.
from __future__ import with_statement
import random, threading, time

from whoosh import analysis, fields, formats, reading
from whoosh.compat import b, u, xrange
from whoosh.reading import SegmentReader
from whoosh.filedb.filestore import RamStorage
from whoosh.util.testing import TempIndex


def _create_index():
    """Return a new, empty in-memory index with KEYWORD fields f1, f2, f3.

    Only ``f1`` is stored.
    """
    s = fields.Schema(f1=fields.KEYWORD(stored=True),
                      f2=fields.KEYWORD,
                      f3=fields.KEYWORD)
    st = RamStorage()
    ix = st.create_index(s)
    return ix


def _one_segment_index():
    """Return an index holding the five fixture documents, written with a
    single commit.
    """
    ix = _create_index()
    w = ix.writer()
    w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z"))
    w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S"))
    w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S"))
    w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z"))
    w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y"))
    w.commit()

    return ix


def _multi_segment_index():
    """Return an index holding the same five fixture documents as
    ``_one_segment_index()``, written in three separate commits.

    The second and third commits pass ``merge=False``; ``test_readers``
    asserts that the result has three segments.
    """
    ix = _create_index()
    w = ix.writer()
    w.add_document(f1=u("A B C"), f2=u("1 2 3"), f3=u("X Y Z"))
    w.add_document(f1=u("D E F"), f2=u("4 5 6"), f3=u("Q R S"))
    w.commit()

    w = ix.writer()
    w.add_document(f1=u("A E C"), f2=u("1 4 6"), f3=u("X Q S"))
    w.add_document(f1=u("A A A"), f2=u("2 3 5"), f3=u("Y R Z"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(f1=u("A B"), f2=u("1 2"), f3=u("X Y"))
    w.commit(merge=False)

    return ix


def _stats(r):
    """Flatten an iterable of ``((fieldname, text), terminfo)`` pairs into a
    list of ``(fieldname, text, doc_frequency, weight)`` tuples.
    """
    return [(fname, text, ti.doc_frequency(), ti.weight())
            for (fname, text), ti in r]


def _fstats(r):
    """Flatten an iterable of ``(text, terminfo)`` pairs into a list of
    ``(text, doc_frequency, weight)`` tuples.
    """
    return [(text, ti.doc_frequency(), ti.weight())
            for text, ti in r]


def test_readers():
    """The one-segment and three-segment fixture indexes must report the
    same stored fields and the same per-term statistics.
    """
    target = [("f1", b('A'), 4, 6), ("f1", b('B'), 2, 2), ("f1", b('C'), 2, 2),
              ("f1", b('D'), 1, 1), ("f1", b('E'), 2, 2), ("f1", b('F'), 1, 1),
              ("f2", b('1'), 3, 3), ("f2", b('2'), 3, 3), ("f2", b('3'), 2, 2),
              ("f2", b('4'), 2, 2), ("f2", b('5'), 2, 2), ("f2", b('6'), 2, 2),
              ("f3", b('Q'), 2, 2), ("f3", b('R'), 2, 2), ("f3", b('S'), 2, 2),
              ("f3", b('X'), 3, 3), ("f3", b('Y'), 3, 3), ("f3", b('Z'), 2, 2)]
    target = sorted(target)

    stored = [{"f1": "A B C"}, {"f1": "D E F"}, {"f1": "A E C"},
              {"f1": "A A A"}, {"f1": "A B"}]

    def t(ix):
        # Close the reader once the checks are done instead of leaking it.
        with ix.reader() as r:
            assert list(r.all_stored_fields()) == stored
            assert sorted(_stats(r)) == target

    ix = _one_segment_index()
    assert len(ix._segments()) == 1
    t(ix)

    ix = _multi_segment_index()
    assert len(ix._segments()) == 3
    t(ix)


def test_term_inspection():
    """Check the term listing, prefix expansion and frequency/distinctiveness
    queries of a reader over a two-document index.
    """
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u("My document"),
                content=u("AA AA BB BB CC AA AA AA BB BB CC DD EE EE")
            )
            w.add_document(
                title=u("My other document"),
                content=u("AA AB BB CC EE EE AX AX DD")
            )

        with ix.reader() as r:
            cterms = " ".join(r.field_terms("content"))
            assert cterms == "aa ab ax bb cc dd ee"

            a_exp = list(r.expand_prefix("content", "a"))
            assert a_exp == [b('aa'), b('ab'), b('ax')]

            assert set(r.all_terms()) == set([
                ('content', b('aa')), ('content', b('ab')),
                ('content', b('ax')), ('content', b('bb')),
                ('content', b('cc')), ('content', b('dd')),
                ('content', b('ee')), ('title', b('document')),
                ('title', b('my')), ('title', b('other'))
            ])

            # (text, doc_freq, index_freq)
            cstats = _fstats(r.iter_field("content"))
            assert cstats == [
                (b('aa'), 2, 6), (b('ab'), 1, 1), (b('ax'), 1, 2),
                (b('bb'), 2, 5), (b('cc'), 2, 3), (b('dd'), 2, 2),
                (b('ee'), 2, 4)
            ]

            # The expected result includes 'dd' and 'ee': with a prefix,
            # iteration starts at the prefix and continues to the end of
            # the field rather than stopping at non-matching terms.
            prestats = _fstats(r.iter_field("content", prefix="c"))
            assert prestats == [
                (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)
            ]

            assert list(r.most_frequent_terms("content")) == [
                (6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')),
                (2, b('dd'))
            ]
            assert list(r.most_frequent_terms("content", prefix="a")) == [
                (6, b('aa')), (2, b('ax')), (1, b('ab'))
            ]
            assert list(r.most_distinctive_terms("content", 3)) == [
                (1.3862943611198906, b('ax')), (0.6931471805599453, b('ab')),
                (0.0, b('ee'))
            ]


def test_vector_postings():
    """Read back the term vector of a document as ``(term, weight)`` pairs."""
    s = fields.Schema(id=fields.ID(stored=True, unique=True),
                      content=fields.TEXT(vector=formats.Positions()))
    st = RamStorage()
    ix = st.create_index(s)

    writer = ix.writer()
    writer.add_document(id=u('1'),
                        content=u('the quick brown fox jumped over the ' +
                                  'lazy dogs'))
    writer.commit()

    # Materialize the vector inside the ``with`` so the reader can be closed
    # before the comparison.
    with ix.reader() as r:
        terms = list(r.vector_as("weight", 0, "content"))

    assert terms == [(u('brown'), 1.0), (u('dogs'), 1.0), (u('fox'), 1.0),
                     (u('jumped'), 1.0), (u('lazy'), 1.0),
                     (u('over'), 1.0), (u('quick'), 1.0)]


def test_stored_fields():
    """Stored fields must be retrievable by document number and by a unique
    term lookup; the unstored KEYWORD field ``c`` must not appear.
    """
    s = fields.Schema(a=fields.ID(stored=True), b=fields.STORED,
                      c=fields.KEYWORD, d=fields.TEXT(stored=True))
    st = RamStorage()
    ix = st.create_index(s)

    writer = ix.writer()
    writer.add_document(a=u("1"), b="a", c=u("zulu"), d=u("Alfa"))
    writer.add_document(a=u("2"), b="b", c=u("yankee"), d=u("Bravo"))
    writer.add_document(a=u("3"), b="c", c=u("xray"), d=u("Charlie"))
    writer.commit()

    with ix.searcher() as sr:
        assert sr.stored_fields(0) == {"a": u("1"), "b": "a", "d": u("Alfa")}
        assert sr.stored_fields(2) == {"a": u("3"), "b": "c", "d": u("Charlie")}

        assert sr.document(a=u("1")) == {"a": u("1"), "b": "a", "d": u("Alfa")}
        assert sr.document(a=u("2")) == {"a": u("2"), "b": "b", "d": u("Bravo")}


def test_stored_fields2():
    """``Schema.stored_names()`` must list the stored fields in sorted order,
    and a looked-up document must carry all of their values.
    """
    schema = fields.Schema(content=fields.TEXT(stored=True),
                           title=fields.TEXT(stored=True),
                           summary=fields.STORED,
                           path=fields.ID(stored=True))

    storedkeys = ["content", "path", "summary", "title"]
    assert storedkeys == schema.stored_names()

    ix = RamStorage().create_index(schema)

    writer = ix.writer()
    writer.add_document(content=u("Content of this document."),
                        title=u("This is the title"),
                        summary=u("This is the summary"), path=u("/main"))
    writer.add_document(content=u("Second document."), title=u("Second title"),
                        summary=u("Summary numero due"), path=u("/second"))
    writer.add_document(content=u("Third document."), title=u("Title 3"),
                        summary=u("Summary treo"), path=u("/san"))
    writer.commit()

    with ix.searcher() as s:
        doc = s.document(path="/main")
        assert doc is not None
        assert ([doc[k] for k in sorted(doc.keys())]
                == ["Content of this document.", "/main",
                    "This is the summary", "This is the title"])

    ix.close()


def test_all_stored_fields():
    """After deleting two of four documents without merging, the searcher
    must still count four documents in total, two live ones, and
    ``all_stored_fields()`` must yield only the two that were not deleted.
    """
    schema = fields.Schema(a=fields.ID(stored=True), b=fields.STORED)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        w.add_document(a=u("alfa"), b=u("bravo"))
        w.add_document(a=u("apple"), b=u("bear"))
        w.add_document(a=u("alpaca"), b=u("beagle"))
        w.add_document(a=u("aim"), b=u("box"))

    w = ix.writer()
    w.delete_by_term("a", "apple")
    w.delete_by_term("a", "aim")
    w.commit(merge=False)

    with ix.searcher() as s:
        assert s.doc_count_all() == 4
        assert s.doc_count() == 2
        sfs = list((sf["a"], sf["b"]) for sf in s.all_stored_fields())
        assert sfs == [("alfa", "bravo"), ("alpaca", "beagle")]


def test_first_id():
    """``first_id()`` must find the right document both in an index written
    with one commit and in one whose reader is a ``MultiReader``.
    """
    schema = fields.Schema(path=fields.ID(stored=True))
    ix = RamStorage().create_index(schema)

    w = ix.writer()
    w.add_document(path=u("/a"))
    w.add_document(path=u("/b"))
    w.add_document(path=u("/c"))
    w.commit()

    with ix.reader() as r:
        docid = r.first_id("path", u("/b"))
        assert r.stored_fields(docid) == {"path": "/b"}

    ix = RamStorage().create_index(schema)
    w = ix.writer()
    w.add_document(path=u("/a"))
    w.add_document(path=u("/b"))
    w.add_document(path=u("/c"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(path=u("/d"))
    w.add_document(path=u("/e"))
    w.add_document(path=u("/f"))
    w.commit(merge=False)

    w = ix.writer()
    w.add_document(path=u("/g"))
    w.add_document(path=u("/h"))
    w.add_document(path=u("/i"))
    w.commit(merge=False)

    with ix.reader() as r:
        assert r.__class__ == reading.MultiReader
        docid = r.first_id("path", u("/e"))
        assert r.stored_fields(docid) == {"path": "/e"}


class RecoverReader(threading.Thread):
    """Thread that opens and immediately closes a reader on ``ix`` 50 times."""

    def __init__(self, ix):
        threading.Thread.__init__(self)
        self.ix = ix

    def run(self):
        for _ in xrange(50):
            r = self.ix.reader()
            r.close()


class RecoverWriter(threading.Thread):
    """Thread that performs 10 single-document commits on ``ix``, sleeping
    10 ms after each one.
    """

    # Word pool; each added document gets four words sampled from it.
    domain = u("alfa bravo charlie deleta echo foxtrot golf hotel india")
    domain = domain.split()

    def __init__(self, ix):
        threading.Thread.__init__(self)
        self.ix = ix

    def run(self):
        for _ in xrange(10):
            w = self.ix.writer()
            w.add_document(text=random.sample(self.domain, 4))
            w.commit()
            time.sleep(0.01)


def test_delete_recovery():
    """Run a ``RecoverWriter`` and a ``RecoverReader`` concurrently against
    the same on-disk index and wait for both to finish.
    """
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "delrecover") as ix:
        rw = RecoverWriter(ix)
        rr = RecoverReader(ix)
        rw.start()
        rr.start()
        rw.join()
        rr.join()


def test_nonexclusive_read():
    """Five threads must each be able to open a reader on the same
    five-commit index five times and see the full term list.
    """
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema, "readlock") as ix:
        for num in u("one two three four five").split():
            w = ix.writer()
            w.add_document(text=u("Test document %s") % num)
            w.commit(merge=False)

        # An exception raised inside a worker thread is not re-raised in the
        # thread that joins it, so a failing assertion there would go
        # unnoticed. Collect failures and check them in the main thread.
        failures = []

        def fn():
            try:
                for _ in xrange(5):
                    # ``with`` closes the reader even if reading fails.
                    with ix.reader() as r:
                        terms = list(r.field_terms("text"))
                    assert terms == ["document", "five", "four", "one",
                                     "test", "three", "two"]
            except Exception as e:
                failures.append(e)

        ths = [threading.Thread(target=fn) for _ in xrange(5)]
        for th in ths:
            th.start()
        for th in ths:
            th.join()

        assert not failures, failures


def test_doc_count():
    """``doc_count()`` must track live documents and ``doc_count_all()`` must
    include deleted ones until the index is optimized.
    """
    schema = fields.Schema(id=fields.NUMERIC)
    ix = RamStorage().create_index(schema)
    with ix.writer() as w:
        for i in xrange(10):
            w.add_document(id=i)

    with ix.reader() as r:
        assert r.doc_count() == 10
        assert r.doc_count_all() == 10

    w = ix.writer()
    w.delete_document(2)
    w.delete_document(4)
    w.delete_document(6)
    w.delete_document(8)
    w.commit()

    with ix.reader() as r:
        assert r.doc_count() == 6
        assert r.doc_count_all() == 10

    w = ix.writer()
    for i in xrange(10, 15):
        w.add_document(id=i)
    w.commit(merge=False)

    with ix.reader() as r:
        assert r.doc_count() == 11
        assert r.doc_count_all() == 15

    w = ix.writer()
    w.delete_document(10)
    w.delete_document(12)
    w.delete_document(14)
    w.commit(merge=False)

    with ix.reader() as r:
        assert r.doc_count() == 8
        assert r.doc_count_all() == 15

    ix.optimize()
    with ix.reader() as r:
        assert r.doc_count() == 8
        assert r.doc_count_all() == 8


def test_reader_subclasses():
    """Run ``check_abstract_methods`` for each concrete reader class against
    ``reading.IndexReader``.
    """
    from whoosh.util.testing import check_abstract_methods

    check_abstract_methods(reading.IndexReader, SegmentReader)
    check_abstract_methods(reading.IndexReader, reading.MultiReader)
    check_abstract_methods(reading.IndexReader, reading.EmptyReader)


def test_cursor():
    """Walk a term cursor over the ``text`` field: first/next/find, running
    off the end, and per-term weights.
    """
    schema = fields.Schema(text=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(text=u"papa quebec romeo sierra tango")
            w.add_document(text=u"foxtrot golf hotel india juliet")
            w.add_document(text=u"alfa bravo charlie delta echo")
            w.add_document(text=u"uniform victor whiskey x-ray")
            w.add_document(text=u"kilo lima mike november oskar")
            w.add_document(text=u"charlie alfa alfa bravo bravo bravo")

        with ix.reader() as r:
            cur = r.cursor("text")
            assert cur.text() == "alfa"
            assert cur.next() == "bravo"
            assert cur.text() == "bravo"

            # "inc" is not a term; the cursor lands on the next term after it.
            assert cur.find(b"inc") == "india"
            assert cur.text() == "india"

            # This comparison was previously a bare expression with no
            # ``assert``, so the value returned by first() was never checked.
            assert cur.first() == "alfa"
            assert cur.text() == "alfa"

            # Seeking past the last term leaves the cursor invalid.
            assert cur.find(b"zulu") is None
            assert cur.text() is None
            assert not cur.is_valid()

            assert cur.find(b"a") == "alfa"
            assert cur.term_info().weight() == 3
            assert cur.next() == "bravo"
            assert cur.term_info().weight() == 4
            assert cur.next() == "charlie"
            assert cur.term_info().weight() == 2


def _check_inspection_results(ix):
    """Shared assertions for the non-ASCII term inspection tests.

    Expects ``ix`` to hold the two documents written by
    ``test_term_inspection_segment_reader`` and
    ``test_term_inspection_multi_reader``.
    """
    # UTF-8 encoded forms of the two non-ASCII terms.
    AE = u'aé'.encode('utf-8')
    AU = u'aú'.encode('utf-8')

    with ix.reader() as r:
        cterms = " ".join(r.field_terms("content"))
        assert cterms == u"aa aé aú bb cc dd ee"

        a_exp = list(r.expand_prefix("content", "a"))
        assert a_exp == [b('aa'), AE, AU]

        tset = set(r.all_terms())
        assert tset == set([
            ('content', b('aa')), ('content', AE),
            ('content', AU), ('content', b('bb')),
            ('content', b('cc')), ('content', b('dd')),
            ('content', b('ee')), ('title', b('document')),
            ('title', b('my')), ('title', b('other'))
        ])

        # (text, doc_freq, index_freq)
        assert _fstats(r.iter_field("content")) == [
            (b('aa'), 2, 6), (AE, 1, 1), (AU, 1, 2), (b('bb'), 2, 5),
            (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)
        ]
        assert _fstats(r.iter_field("content", prefix="c")) == [
            (b('cc'), 2, 3), (b('dd'), 2, 2), (b('ee'), 2, 4)
        ]

        assert list(r.most_frequent_terms("content")) == [
            (6, b('aa')), (5, b('bb')), (4, b('ee')), (3, b('cc')),
            (2, b('dd'))
        ]
        assert list(r.most_frequent_terms("content", prefix="a")) == [
            (6, b('aa')), (2, AU), (1, AE)
        ]
        assert list(r.most_distinctive_terms("content", 3)) == [
            (1.3862943611198906, AU), (0.6931471805599453, AE), (0.0, b('ee'))
        ]


def test_term_inspection_segment_reader():
    """Run the non-ASCII term inspection checks on an index whose two
    documents are written by a single writer.
    """
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u"My document",
                content=u"AA AA BB BB CC AA AA AA BB BB CC DD EE EE"
            )
            w.add_document(
                title=u"My other document",
                content=u"AA AÉ BB CC EE EE Aú AÚ DD"
            )

        _check_inspection_results(ix)


def test_term_inspection_multi_reader():
    """Run the non-ASCII term inspection checks on an index whose two
    documents are written by two separate writers, the second with merging
    switched off.
    """
    schema = fields.Schema(title=fields.TEXT(stored=True),
                           content=fields.TEXT)
    with TempIndex(schema) as ix:
        with ix.writer() as w:
            w.add_document(
                title=u"My document",
                content=u"AA AA BB BB CC AA AA AA BB BB CC DD EE EE"
            )

        with ix.writer() as w:
            w.add_document(
                title=u"My other document",
                content=u"AA AÉ BB CC EE EE Aú AÚ DD"
            )
            # Set before the ``with`` block exits and commits the writer.
            w.merge = False

        _check_inspection_results(ix)