1/* $Id$ 2 3 Part of SWI-Prolog 4 5 Author: Jan Wielemaker 6 E-mail: wielemak@science.uva.nl 7 WWW: http://www.swi-prolog.org 8 Copyright (C): 2006, University of Amsterdam 9 10 This program is free software; you can redistribute it and/or 11 modify it under the terms of the GNU General Public License 12 as published by the Free Software Foundation; either version 2 13 of the License, or (at your option) any later version. 14 15 This program is distributed in the hope that it will be useful, 16 but WITHOUT ANY WARRANTY; without even the implied warranty of 17 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 18 GNU General Public License for more details. 19 20 You should have received a copy of the GNU General Public 21 License along with this library; if not, write to the Free Software 22 Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA 23 24 As a special exception, if you link this library with other files, 25 compiled with a Free Software compiler, to produce an executable, this 26 library does not by itself cause the resulting executable to be covered 27 by the GNU General Public License. This exception does not however 28 invalidate any other reasons why the executable file might be covered by 29 the GNU General Public License. 30*/ 31 32 33:- module(rdf_litindex, 34 [ rdf_set_literal_index_option/1, % +Options 35 rdf_tokenize_literal/2, % +Literal, -Tokens 36 rdf_find_literals/2, % +Spec, -ListOfLiterals 37 rdf_token_expansions/2 % +Spec, -Expansions 38 ]). 39:- use_module(rdf_db). 40:- use_module(library(debug)). 41:- use_module(library(lists)). 42:- use_module(library(error)). 43:- use_module(library(porter_stem)). 44:- use_module(library(double_metaphone)). 45 46/** <module> Search literals 47This module finds literals of the RDF database based on stemming and 48being flexible to ordering of tokens. 49*/ 50 51:- dynamic 52 literal_map/2, % Type, -Map 53 new_token/1, % Hook 54 setting/1. 55:- volatile 56 literal_map/2. 57:- multifile 58 tokenization/2, % +Literal, -Tokens 59 exclude_from_index/2. % +Which, +Token 60 61 62setting(verbose(true)). % print progress messages 63setting(index_threads(1)). % # threads for creating the index 64setting(index(default)). % Use a thread for incremental updates 65 66%% rdf_set_literal_index_option(+Options:list) 67% 68% Set options for the literal package. Currently defined options 69% 70% * verbose(Bool) 71% If =true=, print progress messages while building the 72% index tables. 73% 74% * index_threads(+Count) 75% Number of threads to use for initial indexing of 76% literals 77% 78% * index(+How) 79% How to deal with indexing new literals. How is one of 80% =self= (execute in the same thread), thread(N) (execute 81% in N concurrent threads) or =default= (depends on number 82% of cores). 83 84rdf_set_literal_index_option([]) :- !. 85rdf_set_literal_index_option([H|T]) :- !, 86 set_option(H), 87 rdf_set_literal_index_option(T). 88rdf_set_literal_index_option(Option) :- 89 set_option(Option). 90 91set_option(Term) :- 92 check_option(Term), 93 functor(Term, Name, Arity), 94 functor(General, Name, Arity), 95 retractall(setting(General)), 96 assert(setting(Term)). 97 98check_option(X) :- 99 var(X), !, 100 instantiation_error(X). 101check_option(verbose(X)) :- !, 102 must_be(boolean, X). 103check_option(index_threads(Count)) :- !, 104 must_be(nonneg, Count). 105check_option(index(How)) :- !, 106 must_be(oneof([default,thread(_),self]), How). 107check_option(Option) :- 108 domain_error(literal_option, Option). 109 110 111 /******************************* 112 * QUERY * 113 *******************************/ 114 115%% rdf_find_literals(+Spec, -Literals) 116% 117% Find literals in the RDF database matching Spec. Spec is defined 118% as: 119% 120% == 121% Spec ::= and(Spec,Spec) 122% Spec ::= or(Spec,Spec) 123% Spec ::= not(Spec) 124% Spec ::= sounds(Like) 125% Spec ::= stem(Like) 126% Spec ::= prefix(Prefix) 127% Spec ::= between(Low, High) % Numerical between 128% Spec ::= ge(High) % Numerical greater-equal 129% Spec ::= le(Low) % Numerical less-equal 130% Spec ::= Token 131% == 132% 133% sounds(Like) and stem(Like) both map to a disjunction. First we 134% compile the spec to normal form: a disjunction of conjunctions 135% on elementary tokens. Then we execute all the conjunctions and 136% generate the union using ordered-set algorithms. 137% 138% @tbd Exploit ordering of numbers and allow for > N, < N, etc. 139 140rdf_find_literals(Spec, Literals) :- 141 compile_spec(Spec, DNF), 142 token_index(Map), 143 lookup(DNF, Map, _, SuperSet), 144 flatten(SuperSet, Set0), 145 sort(Set0, Literals). 146 147%% rdf_token_expansions(+Spec, -Extensions) 148% 149% Determine which extensions of a token contribute to finding 150% literals. 151 152rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :- 153 token_index(Map), 154 rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens). 155rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :- 156 metaphone_index(Map), 157 rdf_find_literal_map(Map, [Like], Tokens). 158rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :- 159 porter_index(Map), 160 rdf_find_literal_map(Map, [Like], Tokens). 161rdf_token_expansions(Spec, Expansions) :- 162 compile_spec(Spec, DNF), 163 token_index(Map), 164 lookup(DNF, Map, SCS, _), 165 flatten(SCS, CS), 166 sort(CS, Expansions0), 167 join_expansions(Expansions0, Expansions). 168 169join_expansions([], []). 170join_expansions([H0|T0], [H|T]) :- 171 untag(H0, Tag, V0), 172 Tag =.. L0, 173 append(L0, [[V0|Values]], L1), 174 H =.. L1, 175 join_expansions_by_tag(T0, Tag, T1, Values), 176 join_expansions(T1, T). 177 178join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :- 179 untag(H, Tag, V0), !, 180 join_expansions_by_tag(T0, Tag, T, VT). 181join_expansions_by_tag(L, _, L, []). 182 183lookup(@(false), _, [], []) :- !. 184lookup(or(H0,T0), Map, [CH|CT], [H|T]) :- !, 185 lookup(H0, Map, CH, H), 186 lookup(T0, Map, CT, T). 187lookup(H0, Map, [C], [H]) :- 188 lookup1(H0, Map, C, H). 189 190lookup1(Conj, Map, Cond, Literals) :- 191 phrase(conj_to_list(Conj), List), !, 192 rdf_find_literal_map(Map, List, Literals), 193 ( Literals \== [] 194 -> phrase(conj_to_cond(Conj), Cond) 195 ; Cond = [] 196 ). 197lookup1(_, _, _, []). 198 199conj_to_list(and(A,B)) --> !, 200 conj_to_list(A), 201 conj_to_list(B). 202conj_to_list(@(false)) --> !, 203 {fail}. 204conj_to_list(Tagged) --> 205 { untag(Tagged, L) }, !, 206 [L]. 207conj_to_list(L) --> 208 [L]. 209 210 211conj_to_cond(and(A,B)) --> !, 212 conj_to_cond(A), 213 conj_to_cond(B). 214conj_to_cond(Tagged) --> 215 { untag(Tagged, _) }, !, 216 [ Tagged ]. 217conj_to_cond(_) --> 218 []. 219 220 221%% compile_spec(+Spec, -Compiled) 222% 223% Compile a specification as above into disjunctive normal form 224 225compile_spec(Spec, DNF) :- 226 expand_fuzzy(Spec, Spec2), 227 nnf(Spec2, NNF), 228 dnf(NNF, DNF). 229 230 231expand_fuzzy(Var, _) :- 232 var(Var), !, 233 throw(error(instantiation_error, _)). 234expand_fuzzy(sounds(Like), Or) :- !, 235 metaphone_index(Map), 236 double_metaphone(Like, Key), 237 rdf_find_literal_map(Map, [Key], Tokens), 238 list_to_or(Tokens, sounds(Like), Or). 239expand_fuzzy(stem(Like), Or) :- !, 240 porter_index(Map), 241 porter_stem(Like, Key), 242 rdf_find_literal_map(Map, [Key], Tokens), 243 list_to_or(Tokens, stem(Like), Or). 244expand_fuzzy(prefix(Prefix), Or) :- !, 245 token_index(Map), 246 rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens), 247 list_to_or(Tokens, prefix(Prefix), Or). 248expand_fuzzy(case(String), Or) :- !, 249 token_index(Map), 250 rdf_keys_in_literal_map(Map, case(String), Tokens), 251 list_to_or(Tokens, case(String), Or). 252expand_fuzzy(or(A0, B0), E) :- !, 253 expand_fuzzy(A0, A), 254 expand_fuzzy(B0, B), 255 simplify(or(A,B), E). 256expand_fuzzy(and(A0, B0), E) :- !, 257 expand_fuzzy(A0, A), 258 expand_fuzzy(B0, B), 259 simplify(and(A,B), E). 260expand_fuzzy(not(A0), not(A)) :- !, 261 expand_fuzzy(A0, A). 262expand_fuzzy(between(Low, High), Or) :- !, 263 token_index(Map), 264 rdf_keys_in_literal_map(Map, between(Low, High), Tokens), 265 list_to_or(Tokens, between(Low, High), Or). 266expand_fuzzy(le(High), Or) :- !, 267 token_index(Map), 268 rdf_keys_in_literal_map(Map, le(High), Tokens), 269 list_to_or(Tokens, le(High), Or). 270expand_fuzzy(ge(Low), Or) :- !, 271 token_index(Map), 272 rdf_keys_in_literal_map(Map, ge(Low), Tokens), 273 list_to_or(Tokens, ge(Low), Or). 274expand_fuzzy(Token, Token) :- 275 atomic(Token), !. 276expand_fuzzy(Token, _) :- 277 throw(error(type_error(Token, boolean_expression), _)). 278 279simplify(Expr0, Expr) :- 280 simple(Expr0, Expr), !. 281simplify(Expr, Expr). 282 283simple(and(@(false), _), @(false)). 284simple(and(_, @(false)), @(false)). 285simple(or(@(false), X), X). 286simple(or(X, @(false)), X). 287 288 289list_to_or([], _, @(false)) :- !. 290list_to_or([X], How, One) :- !, 291 tag(How, X, One). 292list_to_or([H0|T0], How, or(H, T)) :- 293 tag(How, H0, H), 294 list_to_or(T0, How, T). 295 296tag(sounds(X), Y, sounds(X,Y)). 297tag(stem(X), Y, stem(X,Y)). 298tag(prefix(X), Y, prefix(X,Y)). 299tag(case(X), Y, case(X,Y)). 300tag(between(L,H), Y, between(L,H,Y)). 301tag(ge(L), Y, ge(L,Y)). 302tag(le(H), Y, le(H,Y)). 303 304untag(sounds(_,Y), Y). 305untag(stem(_,Y), Y). 306untag(prefix(_,Y), Y). 307untag(case(_,Y), Y). 308untag(between(_,_,Y), Y). 309untag(le(_,Y), Y). 310untag(ge(_,Y), Y). 311 312untag(sounds(X,Y), sounds(X), Y). 313untag(stem(X,Y), stem(X), Y). 314untag(prefix(X,Y), prefix(X), Y). 315untag(case(X,Y), case(X), Y). 316untag(between(L,H,Y), between(L,H), Y). 317untag(ge(L,Y), ge(L), Y). 318untag(le(H,Y), le(H), Y). 319 320 321%% nnf(+Formula, -NNF) 322% 323% Rewrite to Negative Normal Form, meaning negations only appear 324% around literals. 325 326nnf(not(not(A0)), A) :- !, 327 nnf(A0, A). 328nnf(not(and(A0,B0)), or(A,B)) :- !, 329 nnf(not(A0), A), 330 nnf(not(B0), B). 331nnf(not(or(A0,B0)), and(A,B)) :- !, 332 nnf(not(A0), A), 333 nnf(not(B0), B). 334nnf(A, A). 335 336 337%% dnf(+NNF, -DNF) 338% 339% Convert a formula in NNF to Disjunctive Normal Form (DNF) 340 341dnf(or(A0,B0), or(A, B)) :- !, 342 dnf(A0, A), 343 dnf(B0, B). 344dnf(and(A0,B0), DNF):- !, 345 dnf(A0, A1), 346 dnf(B0, B1), 347 dnf1(and(A1,B1), DNF). 348dnf(DNF, DNF). 349 350dnf1(and(A0, or(B,C)), or(P,Q)) :- !, 351 dnf1(and(A0,B), P), 352 dnf1(and(A0,C), Q). 353dnf1(and(or(B,C), A0), or(P,Q)) :- !, 354 dnf1(and(A0,B), P), 355 dnf1(and(A0,C), Q). 356dnf1(DNF, DNF). 357 358 359 /******************************* 360 * TOKEN INDEX * 361 *******************************/ 362 363%% token_index(-Map) 364% 365% Get the index of tokens. If not present, create one from the 366% current database. Once created, the map is kept up-to-date using 367% a monitor hook. 368 369token_index(Map) :- 370 literal_map(tokens, Map), !. 371token_index(Map) :- 372 rdf_new_literal_map(Map), 373 assert(literal_map(tokens, Map)), 374 make_literal_index, 375 verbose('~N', []), 376 Monitor = [ reset, 377 new_literal, 378 old_literal 379 ], 380 ( setting(index(default)) 381 -> ( current_prolog_flag(cpu_count, N), N > 1 382 -> create_update_literal_thread(1), 383 rdf_monitor(thread_monitor_literal, Monitor) 384 ; rdf_monitor(monitor_literal, Monitor) 385 ) 386 ; setting(index(thread(N))) 387 -> create_update_literal_thread(N), 388 rdf_monitor(thread_monitor_literal, Monitor) 389 ; rdf_monitor(monitor_literal, Monitor) 390 ). 391 392 393%% make_literal_index 394% 395% Create the initial literal index. 396 397make_literal_index :- 398 setting(index_threads(N)), !, 399 threaded_literal_index(N). 400make_literal_index :- 401 current_prolog_flag(cpu_count, X), 402 threaded_literal_index(X). 403 404threaded_literal_index(N) :- 405 N > 1, !, 406 message_queue_create(Q, [max_size(1000)]), 407 create_index_threads(N, Q, Ids), 408 forall(rdf_current_literal(Literal), 409 thread_send_message(Q, Literal)), 410 forall(between(1, N, _), 411 thread_send_message(Q, done(true))), 412 maplist(thread_join, Ids, _). 413threaded_literal_index(_) :- 414 forall(rdf_current_literal(Literal), 415 register_literal(Literal)). 416 417create_index_threads(N, Q, [Id|T]) :- 418 N > 0, !, 419 thread_create(index_worker(Q), Id, 420 [ local(1000), 421 global(1000), 422 trail(1000) 423 ]), 424 N2 is N - 1, 425 create_index_threads(N2, Q, T). 426create_index_threads(_, _, []) :- !. 427 428index_worker(Queue) :- 429 repeat, 430 thread_get_message(Queue, Msg), 431 work(Msg). 432 433work(done(true)) :- !. 434work(Literal) :- 435 register_literal(Literal), 436 fail. 437 438 439% clean_token_index 440% 441% Clean after a reset. 442 443clean_token_index :- 444 forall(literal_map(_, Map), 445 rdf_reset_literal_map(Map)). 446 447 /******************************* 448 * THREADED UPDATE * 449 *******************************/ 450 451% create_update_literal_thread(+Threads) 452% 453% Setup literal monitoring using threads. While loading databases 454% through rdf_attach_db/2 from rdf_persistency.pl, most of the 455% time is spent updating the literal token database. While loading 456% the RDF triples, most of the time is spend in updating the AVL 457% tree holding the literals. Updating the token index hangs on 458% updating the AVL trees holding the tokens. Both tasks however 459% can run concurrently. 460 461create_update_literal_thread(Threads) :- 462 message_queue_create(_, 463 [ alias(rdf_literal_monitor_queue), 464 max_size(10000) 465 ]), 466 forall(between(1, Threads, N), 467 ( atom_concat(rdf_literal_monitor_, N, Alias), 468 thread_create(monitor_literals, _, 469 [ alias(Alias), 470 local(1000), 471 global(1000), 472 trail(1000) 473 ]) 474 )). 475 476monitor_literals :- 477 set_prolog_flag(agc_margin, 0), % we don't create garbage 478 repeat, 479 thread_get_message(rdf_literal_monitor_queue, Literal), 480 register_literal(Literal), 481 fail. 482 483thread_monitor_literal(new_literal(Literal)) :- !, 484 thread_send_message(rdf_literal_monitor_queue, Literal). 485thread_monitor_literal(Action) :- !, 486 monitor_literal(Action). 487 488 489 /******************************* 490 * MONITORED UPDATE * 491 *******************************/ 492 493monitor_literal(new_literal(Literal)) :- 494 register_literal(Literal). 495monitor_literal(old_literal(Literal)) :- 496 unregister_literal(Literal). 497monitor_literal(transaction(begin, reset)) :- 498 rdf_monitor(monitor_literal, [-old_literal]), 499 clean_token_index. 500monitor_literal(transaction(end, reset)) :- 501 rdf_monitor(monitor_literal, [+old_literal]). 502 503%% register_literal(+Literal) 504% 505% Associate the tokens of a literal with the literal itself. 506 507register_literal(Literal) :- 508 ( rdf_tokenize_literal(Literal, Tokens) 509 -> text_of(Literal, Text), 510 literal_map(tokens, Map), 511 add_tokens(Tokens, Text, Map) 512 ; true 513 ). 514 515add_tokens([], _, _). 516add_tokens([H|T], Literal, Map) :- 517 rdf_insert_literal_map(Map, H, Literal, Keys), 518 ( var(Keys) 519 -> true 520 ; forall(new_token(H), true), 521 ( Keys mod 1000 =:= 0 522 -> progress(Map, 'Tokens') 523 ; true 524 ) 525 ), 526 add_tokens(T, Literal, Map). 527 528 529%% unregister_literal(+Literal) 530% 531% Literal is removed from the database. As we abstract from lang 532% and type qualifiers we first have to check this is the last one 533% that is destroyed. 534 535unregister_literal(Literal) :- 536 text_of(Literal, Text), 537 ( rdf(_,_,literal(Text)) 538 -> true % still something left 539 ; rdf_tokenize_literal(Literal, Tokens), 540 literal_map(tokens, Map), 541 del_tokens(Tokens, Text, Map) 542 ). 543 544del_tokens([], _, _). 545del_tokens([H|T], Literal, Map) :- 546 rdf_delete_literal_map(Map, H, Literal), 547 del_tokens(T, Literal, Map). 548 549 550%% rdf_tokenize_literal(+Literal, -Tokens) is semidet. 551% 552% Tokenize a literal. We make this hookable as tokenization is 553% generally domain dependent. 554 555rdf_tokenize_literal(Literal, Tokens) :- 556 tokenization(Literal, Tokens), !. % Hook 557rdf_tokenize_literal(Literal, Tokens) :- 558 text_of(Literal, Text), 559 atom(Text), 560 tokenize_atom(Text, Tokens0), 561 select_tokens(Tokens0, Tokens). 562 563select_tokens([], []). 564select_tokens([H|T0], T) :- 565 ( exclude_from_index(token, H) 566 -> select_tokens(T0, T) 567 ; number(H) 568 -> ( integer(H), 569 between(-1073741824, 1073741823, H) 570 -> T = [H|T1], 571 select_tokens(T0, T1) 572 ; select_tokens(T0, T) 573 ) 574 ; atom_length(H, 1) 575 -> select_tokens(T0, T) 576 ; no_index_token(H) 577 -> select_tokens(T0, T) 578 ; T = [H|T1], 579 select_tokens(T0, T1) 580 ). 581 582 583% no_index_token/1 584% 585% Tokens we do not wish to index, as they creat huge amounts of 586% data with little or no value. Is there a more general way to 587% describe this? Experience shows that simply word count is not a 588% good criterium as it often rules out popular domain terms. 589 590no_index_token(and). 591no_index_token(an). 592no_index_token(or). 593no_index_token(of). 594no_index_token(on). 595no_index_token(in). 596no_index_token(this). 597no_index_token(the). 598 599 600%% text_of(+LiteralArg, -Text) 601% 602% Get the textual or (integer) numerical information from a 603% literal value. 604 605text_of(type(_, Text), Text) :- !. 606text_of(lang(_, Text), Text) :- !. 607text_of(Text, Text) :- atom(Text), !. 608text_of(Text, Text) :- integer(Text). 609 610 611 /******************************* 612 * PORTER INDEX * 613 *******************************/ 614 615 616porter_index(Map) :- 617 literal_map(porter, Map), !. 618porter_index(Map) :- 619 rdf_new_literal_map(Map), 620 assert(literal_map(porter, Map)), 621 fill_porter_index(Map), 622 assert((new_token(Token) :- add_stem(Token, Map))). 623 624fill_porter_index(PorterMap) :- 625 token_index(TokenMap), 626 rdf_keys_in_literal_map(TokenMap, all, Tokens), 627 stem(Tokens, PorterMap). 628 629stem([], _). 630stem([Token|T], Map) :- 631 ( atom(Token) 632 -> porter_stem(Token, Stem), 633 rdf_insert_literal_map(Map, Stem, Token, Keys), 634 ( integer(Keys), 635 Keys mod 1000 =:= 0 636 -> progress(Map, 'Porter') 637 ; true 638 ) 639 ; true 640 ), 641 stem(T, Map). 642 643 644add_stem(Token, Map) :- 645 porter_stem(Token, Stem), 646 rdf_insert_literal_map(Map, Stem, Token, _). 647 648 649 /******************************* 650 * METAPHONE INDEX * 651 *******************************/ 652 653 654metaphone_index(Map) :- 655 literal_map(metaphone, Map), !. 656metaphone_index(Map) :- 657 rdf_new_literal_map(Map), 658 assert(literal_map(metaphone, Map)), 659 fill_metaphone_index(Map), 660 assert((new_token(Token) :- add_metaphone(Token, Map))). 661 662fill_metaphone_index(PorterMap) :- 663 token_index(TokenMap), 664 rdf_keys_in_literal_map(TokenMap, all, Tokens), 665 metaphone(Tokens, PorterMap). 666 667metaphone([], _). 668metaphone([Token|T], Map) :- 669 ( atom(Token) 670 -> double_metaphone(Token, SoundEx), 671 rdf_insert_literal_map(Map, SoundEx, Token, Keys), 672 ( integer(Keys), 673 Keys mod 1000 =:= 0 674 -> progress(Map, 'Metaphone') 675 ; true 676 ) 677 ; true 678 ), 679 metaphone(T, Map). 680 681 682add_metaphone(Token, Map) :- 683 double_metaphone(Token, SoundEx), 684 rdf_insert_literal_map(Map, SoundEx, Token). 685 686 687 /******************************* 688 * UTIL * 689 *******************************/ 690 691verbose(Fmt, Args) :- 692 setting(verbose(true)), !, 693 format(user_error, Fmt, Args). 694verbose(_, _). 695 696progress(Map, Which) :- 697 setting(verbose(true)), !, 698 rdf_statistics_literal_map(Map, size(Keys, Values)), 699 format(user_error, 700 '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+', 701 [Which, Keys, Values]). 702progress(_,_). 703