1/*  $Id$
2
3    Part of SWI-Prolog
4
5    Author:        Jan Wielemaker
6    E-mail:        wielemak@science.uva.nl
7    WWW:           http://www.swi-prolog.org
8    Copyright (C): 2006, University of Amsterdam
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License
12    as published by the Free Software Foundation; either version 2
13    of the License, or (at your option) any later version.
14
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19
20    You should have received a copy of the GNU General Public
21    License along with this library; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
24    As a special exception, if you link this library with other files,
25    compiled with a Free Software compiler, to produce an executable, this
26    library does not by itself cause the resulting executable to be covered
27    by the GNU General Public License. This exception does not however
28    invalidate any other reasons why the executable file might be covered by
29    the GNU General Public License.
30*/
31
32
33:- module(rdf_litindex,
34	  [ rdf_set_literal_index_option/1,	% +Options
35	    rdf_tokenize_literal/2,		% +Literal, -Tokens
36	    rdf_find_literals/2,		% +Spec, -ListOfLiterals
37	    rdf_token_expansions/2		% +Spec, -Expansions
38	  ]).
39:- use_module(rdf_db).
40:- use_module(library(debug)).
41:- use_module(library(lists)).
42:- use_module(library(error)).
43:- use_module(library(porter_stem)).
44:- use_module(library(double_metaphone)).
45
46/** <module> Search literals
47This module finds literals of the RDF database based on stemming and
48being flexible to ordering of tokens.
49*/
50
51:- dynamic
52	literal_map/2,			% Type, -Map
53	new_token/1,			% Hook
54	setting/1.
55:- volatile
56	literal_map/2.
57:- multifile
58	tokenization/2,			% +Literal, -Tokens
59	exclude_from_index/2.		% +Which, +Token
60
61
62setting(verbose(true)).			% print progress messages
63setting(index_threads(1)).		% # threads for creating the index
64setting(index(default)).		% Use a thread for incremental updates
65
66%%	rdf_set_literal_index_option(+Options:list)
67%
68%	Set options for the literal package.  Currently defined options
69%
70%		* verbose(Bool)
71%		If =true=, print progress messages while building the
72%		index tables.
73%
74%		* index_threads(+Count)
75%		Number of threads to use for initial indexing of
76%		literals
77%
78%		* index(+How)
79%		How to deal with indexing new literals.  How is one of
80%		=self= (execute in the same thread), thread(N) (execute
81%		in N concurrent threads) or =default= (depends on number
82%		of cores).
83
84rdf_set_literal_index_option([]) :- !.
85rdf_set_literal_index_option([H|T]) :- !,
86	set_option(H),
87	rdf_set_literal_index_option(T).
88rdf_set_literal_index_option(Option) :-
89	set_option(Option).
90
91set_option(Term) :-
92	check_option(Term),
93	functor(Term, Name, Arity),
94	functor(General, Name, Arity),
95	retractall(setting(General)),
96	assert(setting(Term)).
97
98check_option(X) :-
99	var(X), !,
100	instantiation_error(X).
101check_option(verbose(X)) :- !,
102	must_be(boolean, X).
103check_option(index_threads(Count)) :- !,
104	must_be(nonneg, Count).
105check_option(index(How)) :- !,
106	must_be(oneof([default,thread(_),self]), How).
107check_option(Option) :-
108	domain_error(literal_option, Option).
109
110
111		 /*******************************
112		 *	      QUERY		*
113		 *******************************/
114
115%%	rdf_find_literals(+Spec, -Literals)
116%
117%	Find literals in the RDF database matching Spec.  Spec is defined
118%	as:
119%
120%	==
121%	Spec ::= and(Spec,Spec)
122%	Spec ::= or(Spec,Spec)
123%	Spec ::= not(Spec)
124%	Spec ::= sounds(Like)
125%	Spec ::= stem(Like)
126%	Spec ::= prefix(Prefix)
127%	Spec ::= between(Low, High)	% Numerical between
128%	Spec ::= ge(High)		% Numerical greater-equal
129%	Spec ::= le(Low)		% Numerical less-equal
130%	Spec ::= Token
131%	==
132%
133%	sounds(Like) and stem(Like) both map to  a disjunction. First we
134%	compile the spec to normal form:   a disjunction of conjunctions
135%	on elementary tokens. Then we execute   all the conjunctions and
136%	generate the union using ordered-set algorithms.
137%
138%	@tbd Exploit ordering of numbers and allow for > N, < N, etc.
139
140rdf_find_literals(Spec, Literals) :-
141	compile_spec(Spec, DNF),
142	token_index(Map),
143	lookup(DNF, Map, _, SuperSet),
144	flatten(SuperSet, Set0),
145	sort(Set0, Literals).
146
147%%	rdf_token_expansions(+Spec, -Extensions)
148%
149%	Determine which extensions of  a   token  contribute  to finding
150%	literals.
151
152rdf_token_expansions(prefix(Prefix), [prefix(Prefix, Tokens)]) :-
153	token_index(Map),
154	rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens).
155rdf_token_expansions(sounds(Like), [sounds(Like, Tokens)]) :-
156	metaphone_index(Map),
157	rdf_find_literal_map(Map, [Like], Tokens).
158rdf_token_expansions(stem(Like), [stem(Like, Tokens)]) :-
159	porter_index(Map),
160	rdf_find_literal_map(Map, [Like], Tokens).
161rdf_token_expansions(Spec, Expansions) :-
162	compile_spec(Spec, DNF),
163	token_index(Map),
164	lookup(DNF, Map, SCS, _),
165	flatten(SCS, CS),
166	sort(CS, Expansions0),
167	join_expansions(Expansions0, Expansions).
168
169join_expansions([], []).
170join_expansions([H0|T0], [H|T]) :-
171	untag(H0, Tag, V0),
172	Tag =.. L0,
173	append(L0, [[V0|Values]], L1),
174	H =.. L1,
175	join_expansions_by_tag(T0, Tag, T1, Values),
176	join_expansions(T1, T).
177
178join_expansions_by_tag([H|T0], Tag, T, [V0|VT]) :-
179	untag(H, Tag, V0), !,
180	join_expansions_by_tag(T0, Tag, T, VT).
181join_expansions_by_tag(L, _, L, []).
182
183lookup(@(false), _, [], []) :- !.
184lookup(or(H0,T0), Map, [CH|CT], [H|T]) :- !,
185	lookup(H0, Map, CH, H),
186	lookup(T0, Map, CT, T).
187lookup(H0, Map, [C], [H]) :-
188	lookup1(H0, Map, C, H).
189
190lookup1(Conj, Map, Cond, Literals) :-
191	phrase(conj_to_list(Conj), List), !,
192	rdf_find_literal_map(Map, List, Literals),
193	(   Literals \== []
194	->  phrase(conj_to_cond(Conj), Cond)
195	;   Cond = []
196	).
197lookup1(_, _, _, []).
198
199conj_to_list(and(A,B)) --> !,
200	conj_to_list(A),
201	conj_to_list(B).
202conj_to_list(@(false)) --> !,
203	{fail}.
204conj_to_list(Tagged) -->
205	{ untag(Tagged, L) }, !,
206	[L].
207conj_to_list(L) -->
208	[L].
209
210
211conj_to_cond(and(A,B)) --> !,
212	conj_to_cond(A),
213	conj_to_cond(B).
214conj_to_cond(Tagged) -->
215	{ untag(Tagged, _) }, !,
216	[ Tagged ].
217conj_to_cond(_) -->
218	[].
219
220
221%%	compile_spec(+Spec, -Compiled)
222%
223%	Compile a specification as above into disjunctive normal form
224
225compile_spec(Spec, DNF) :-
226	expand_fuzzy(Spec, Spec2),
227	nnf(Spec2, NNF),
228	dnf(NNF, DNF).
229
230
231expand_fuzzy(Var, _) :-
232	var(Var), !,
233	throw(error(instantiation_error, _)).
234expand_fuzzy(sounds(Like), Or) :- !,
235	metaphone_index(Map),
236	double_metaphone(Like, Key),
237	rdf_find_literal_map(Map, [Key], Tokens),
238	list_to_or(Tokens, sounds(Like), Or).
239expand_fuzzy(stem(Like), Or) :- !,
240	porter_index(Map),
241	porter_stem(Like, Key),
242	rdf_find_literal_map(Map, [Key], Tokens),
243	list_to_or(Tokens, stem(Like), Or).
244expand_fuzzy(prefix(Prefix), Or) :- !,
245	token_index(Map),
246	rdf_keys_in_literal_map(Map, prefix(Prefix), Tokens),
247	list_to_or(Tokens, prefix(Prefix), Or).
248expand_fuzzy(case(String), Or) :- !,
249	token_index(Map),
250	rdf_keys_in_literal_map(Map, case(String), Tokens),
251	list_to_or(Tokens, case(String), Or).
252expand_fuzzy(or(A0, B0), E) :- !,
253	expand_fuzzy(A0, A),
254	expand_fuzzy(B0, B),
255	simplify(or(A,B), E).
256expand_fuzzy(and(A0, B0), E) :- !,
257	expand_fuzzy(A0, A),
258	expand_fuzzy(B0, B),
259	simplify(and(A,B), E).
260expand_fuzzy(not(A0), not(A)) :- !,
261	expand_fuzzy(A0, A).
262expand_fuzzy(between(Low, High), Or) :- !,
263	token_index(Map),
264	rdf_keys_in_literal_map(Map, between(Low, High), Tokens),
265	list_to_or(Tokens, between(Low, High), Or).
266expand_fuzzy(le(High), Or) :- !,
267	token_index(Map),
268	rdf_keys_in_literal_map(Map, le(High), Tokens),
269	list_to_or(Tokens, le(High), Or).
270expand_fuzzy(ge(Low), Or) :- !,
271	token_index(Map),
272	rdf_keys_in_literal_map(Map, ge(Low), Tokens),
273	list_to_or(Tokens, ge(Low), Or).
274expand_fuzzy(Token, Token) :-
275	atomic(Token), !.
276expand_fuzzy(Token, _) :-
277	throw(error(type_error(Token, boolean_expression), _)).
278
279simplify(Expr0, Expr) :-
280	simple(Expr0, Expr), !.
281simplify(Expr, Expr).
282
283simple(and(@(false), _), @(false)).
284simple(and(_, @(false)), @(false)).
285simple(or(@(false), X), X).
286simple(or(X, @(false)), X).
287
288
289list_to_or([], _, @(false)) :- !.
290list_to_or([X], How, One) :- !,
291	tag(How, X, One).
292list_to_or([H0|T0], How, or(H, T)) :-
293	tag(How, H0, H),
294	list_to_or(T0, How, T).
295
296tag(sounds(X),	  Y, sounds(X,Y)).
297tag(stem(X),	  Y, stem(X,Y)).
298tag(prefix(X),	  Y, prefix(X,Y)).
299tag(case(X),	  Y, case(X,Y)).
300tag(between(L,H), Y, between(L,H,Y)).
301tag(ge(L),	  Y, ge(L,Y)).
302tag(le(H),	  Y, le(H,Y)).
303
304untag(sounds(_,Y),    Y).
305untag(stem(_,Y),      Y).
306untag(prefix(_,Y),    Y).
307untag(case(_,Y),      Y).
308untag(between(_,_,Y), Y).
309untag(le(_,Y),	      Y).
310untag(ge(_,Y),	      Y).
311
312untag(sounds(X,Y),    sounds(X),    Y).
313untag(stem(X,Y),      stem(X),	    Y).
314untag(prefix(X,Y),    prefix(X),    Y).
315untag(case(X,Y),      case(X),	    Y).
316untag(between(L,H,Y), between(L,H), Y).
317untag(ge(L,Y),	      ge(L),	    Y).
318untag(le(H,Y),	      le(H),	    Y).
319
320
321%%	nnf(+Formula, -NNF)
322%
323%	Rewrite to Negative Normal Form, meaning negations only appear
324%	around literals.
325
326nnf(not(not(A0)), A) :- !,
327	nnf(A0, A).
328nnf(not(and(A0,B0)), or(A,B)) :- !,
329	nnf(not(A0), A),
330	nnf(not(B0), B).
331nnf(not(or(A0,B0)), and(A,B)) :- !,
332	nnf(not(A0), A),
333	nnf(not(B0), B).
334nnf(A, A).
335
336
337%%	dnf(+NNF, -DNF)
338%
339%	Convert a formula in NNF to Disjunctive Normal Form (DNF)
340
341dnf(or(A0,B0), or(A, B)) :- !,
342	dnf(A0, A),
343	dnf(B0, B).
344dnf(and(A0,B0), DNF):- !,
345	dnf(A0, A1),
346	dnf(B0, B1),
347	dnf1(and(A1,B1), DNF).
348dnf(DNF, DNF).
349
350dnf1(and(A0, or(B,C)), or(P,Q)) :- !,
351	dnf1(and(A0,B), P),
352	dnf1(and(A0,C), Q).
353dnf1(and(or(B,C), A0), or(P,Q)) :- !,
354	dnf1(and(A0,B), P),
355	dnf1(and(A0,C), Q).
356dnf1(DNF, DNF).
357
358
359		 /*******************************
360		 *	    TOKEN INDEX		*
361		 *******************************/
362
363%%	token_index(-Map)
364%
365%	Get the index of tokens. If  not   present,  create one from the
366%	current database. Once created, the map is kept up-to-date using
367%	a monitor hook.
368
369token_index(Map) :-
370	literal_map(tokens, Map), !.
371token_index(Map) :-
372	rdf_new_literal_map(Map),
373	assert(literal_map(tokens, Map)),
374	make_literal_index,
375	verbose('~N', []),
376	Monitor = [ reset,
377		    new_literal,
378		    old_literal
379		  ],
380	(   setting(index(default))
381	->  (   current_prolog_flag(cpu_count, N), N > 1
382	    ->	create_update_literal_thread(1),
383		rdf_monitor(thread_monitor_literal, Monitor)
384	    ;	rdf_monitor(monitor_literal, Monitor)
385	    )
386	;   setting(index(thread(N)))
387	->  create_update_literal_thread(N),
388	    rdf_monitor(thread_monitor_literal, Monitor)
389	;   rdf_monitor(monitor_literal, Monitor)
390	).
391
392
393%%	make_literal_index
394%
395%	Create the initial literal index.
396
397make_literal_index :-
398	setting(index_threads(N)), !,
399	threaded_literal_index(N).
400make_literal_index :-
401	current_prolog_flag(cpu_count, X),
402	threaded_literal_index(X).
403
404threaded_literal_index(N) :-
405	N > 1, !,
406	message_queue_create(Q, [max_size(1000)]),
407	create_index_threads(N, Q, Ids),
408	forall(rdf_current_literal(Literal),
409	       thread_send_message(Q, Literal)),
410	forall(between(1, N, _),
411	       thread_send_message(Q, done(true))),
412	maplist(thread_join, Ids, _).
413threaded_literal_index(_) :-
414	forall(rdf_current_literal(Literal),
415	       register_literal(Literal)).
416
417create_index_threads(N, Q, [Id|T]) :-
418	N > 0, !,
419	thread_create(index_worker(Q), Id,
420		      [ local(1000),
421			global(1000),
422			trail(1000)
423		      ]),
424	N2 is N - 1,
425	create_index_threads(N2, Q, T).
426create_index_threads(_, _, []) :- !.
427
428index_worker(Queue) :-
429	repeat,
430	    thread_get_message(Queue, Msg),
431	    work(Msg).
432
433work(done(true)) :- !.
434work(Literal) :-
435	register_literal(Literal),
436	fail.
437
438
439%	clean_token_index
440%
441%	Clean after a reset.
442
443clean_token_index :-
444	forall(literal_map(_, Map),
445	       rdf_reset_literal_map(Map)).
446
447		 /*******************************
448		 *	  THREADED UPDATE	*
449		 *******************************/
450
451%	create_update_literal_thread(+Threads)
452%
453%	Setup literal monitoring using threads.  While loading databases
454%	through rdf_attach_db/2 from  rdf_persistency.pl,   most  of the
455%	time is spent updating the literal token database. While loading
456%	the RDF triples, most of the time   is spend in updating the AVL
457%	tree holding the literals. Updating  the   token  index hangs on
458%	updating the AVL trees holding the   tokens.  Both tasks however
459%	can run concurrently.
460
461create_update_literal_thread(Threads) :-
462	message_queue_create(_,
463			     [ alias(rdf_literal_monitor_queue),
464			       max_size(10000)
465			     ]),
466	forall(between(1, Threads, N),
467	       (   atom_concat(rdf_literal_monitor_, N, Alias),
468		   thread_create(monitor_literals, _,
469				 [ alias(Alias),
470				   local(1000),
471				   global(1000),
472				   trail(1000)
473				 ])
474	       )).
475
476monitor_literals :-
477	set_prolog_flag(agc_margin, 0),	% we don't create garbage
478	repeat,
479	    thread_get_message(rdf_literal_monitor_queue, Literal),
480	    register_literal(Literal),
481	fail.
482
483thread_monitor_literal(new_literal(Literal)) :- !,
484	thread_send_message(rdf_literal_monitor_queue, Literal).
485thread_monitor_literal(Action) :- !,
486	monitor_literal(Action).
487
488
489		 /*******************************
490		 *	 MONITORED UPDATE	*
491		 *******************************/
492
493monitor_literal(new_literal(Literal)) :-
494	register_literal(Literal).
495monitor_literal(old_literal(Literal)) :-
496	unregister_literal(Literal).
497monitor_literal(transaction(begin, reset)) :-
498	rdf_monitor(monitor_literal, [-old_literal]),
499	clean_token_index.
500monitor_literal(transaction(end, reset)) :-
501	rdf_monitor(monitor_literal, [+old_literal]).
502
503%%	register_literal(+Literal)
504%
505%	Associate the tokens of a literal with the literal itself.
506
507register_literal(Literal) :-
508	(   rdf_tokenize_literal(Literal, Tokens)
509	->  text_of(Literal, Text),
510	    literal_map(tokens, Map),
511	    add_tokens(Tokens, Text, Map)
512	;   true
513	).
514
515add_tokens([], _, _).
516add_tokens([H|T], Literal, Map) :-
517	rdf_insert_literal_map(Map, H, Literal, Keys),
518	(   var(Keys)
519	->  true
520	;   forall(new_token(H), true),
521	    (	Keys mod 1000 =:= 0
522	    ->	progress(Map, 'Tokens')
523	    ;	true
524	    )
525	),
526	add_tokens(T, Literal, Map).
527
528
529%%	unregister_literal(+Literal)
530%
531%	Literal is removed from the database.   As we abstract from lang
532%	and type qualifiers we first have to  check this is the last one
533%	that is destroyed.
534
535unregister_literal(Literal) :-
536	text_of(Literal, Text),
537	(   rdf(_,_,literal(Text))
538	->  true			% still something left
539	;   rdf_tokenize_literal(Literal, Tokens),
540	    literal_map(tokens, Map),
541	    del_tokens(Tokens, Text, Map)
542	).
543
544del_tokens([], _, _).
545del_tokens([H|T], Literal, Map) :-
546	rdf_delete_literal_map(Map, H, Literal),
547	del_tokens(T, Literal, Map).
548
549
550%%	rdf_tokenize_literal(+Literal, -Tokens) is semidet.
551%
552%	Tokenize a literal. We make  this   hookable  as tokenization is
553%	generally domain dependent.
554
555rdf_tokenize_literal(Literal, Tokens) :-
556	tokenization(Literal, Tokens), !. 		% Hook
557rdf_tokenize_literal(Literal, Tokens) :-
558	text_of(Literal, Text),
559	atom(Text),
560	tokenize_atom(Text, Tokens0),
561	select_tokens(Tokens0, Tokens).
562
563select_tokens([], []).
564select_tokens([H|T0], T) :-
565	(   exclude_from_index(token, H)
566	->  select_tokens(T0, T)
567	;   number(H)
568	->  (   integer(H),
569	        between(-1073741824, 1073741823, H)
570	    ->	T = [H|T1],
571		select_tokens(T0, T1)
572	    ;   select_tokens(T0, T)
573	    )
574	;   atom_length(H, 1)
575	->  select_tokens(T0, T)
576	;   no_index_token(H)
577	->  select_tokens(T0, T)
578	;   T = [H|T1],
579	    select_tokens(T0, T1)
580	).
581
582
583%	no_index_token/1
584%
585%	Tokens we do not wish to index,   as  they creat huge amounts of
586%	data with little or no value.  Is   there  a more general way to
587%	describe this? Experience shows that simply  word count is not a
588%	good criterium as it often rules out popular domain terms.
589
590no_index_token(and).
591no_index_token(an).
592no_index_token(or).
593no_index_token(of).
594no_index_token(on).
595no_index_token(in).
596no_index_token(this).
597no_index_token(the).
598
599
600%%	text_of(+LiteralArg, -Text)
601%
602%	Get the textual  or  (integer)   numerical  information  from  a
603%	literal value.
604
605text_of(type(_, Text), Text) :- !.
606text_of(lang(_, Text), Text) :- !.
607text_of(Text, Text) :- atom(Text), !.
608text_of(Text, Text) :- integer(Text).
609
610
611		 /*******************************
612		 *	   PORTER INDEX		*
613		 *******************************/
614
615
616porter_index(Map) :-
617	literal_map(porter, Map), !.
618porter_index(Map) :-
619	rdf_new_literal_map(Map),
620	assert(literal_map(porter, Map)),
621	fill_porter_index(Map),
622	assert((new_token(Token) :- add_stem(Token, Map))).
623
624fill_porter_index(PorterMap) :-
625	token_index(TokenMap),
626	rdf_keys_in_literal_map(TokenMap, all, Tokens),
627	stem(Tokens, PorterMap).
628
629stem([], _).
630stem([Token|T], Map) :-
631	(   atom(Token)
632	->  porter_stem(Token, Stem),
633	    rdf_insert_literal_map(Map, Stem, Token, Keys),
634	    (	integer(Keys),
635		Keys mod 1000 =:= 0
636	    ->  progress(Map, 'Porter')
637	    ;	true
638	    )
639	;   true
640	),
641	stem(T, Map).
642
643
644add_stem(Token, Map) :-
645	porter_stem(Token, Stem),
646	rdf_insert_literal_map(Map, Stem, Token, _).
647
648
649		 /*******************************
650		 *	  METAPHONE INDEX	*
651		 *******************************/
652
653
654metaphone_index(Map) :-
655	literal_map(metaphone, Map), !.
656metaphone_index(Map) :-
657	rdf_new_literal_map(Map),
658	assert(literal_map(metaphone, Map)),
659	fill_metaphone_index(Map),
660	assert((new_token(Token) :- add_metaphone(Token, Map))).
661
662fill_metaphone_index(PorterMap) :-
663	token_index(TokenMap),
664	rdf_keys_in_literal_map(TokenMap, all, Tokens),
665	metaphone(Tokens, PorterMap).
666
667metaphone([], _).
668metaphone([Token|T], Map) :-
669	(   atom(Token)
670	->  double_metaphone(Token, SoundEx),
671	    rdf_insert_literal_map(Map, SoundEx, Token, Keys),
672	    (	integer(Keys),
673		Keys mod 1000 =:= 0
674	    ->	progress(Map, 'Metaphone')
675	    ;	true
676	    )
677	;   true
678	),
679	metaphone(T, Map).
680
681
682add_metaphone(Token, Map) :-
683	double_metaphone(Token, SoundEx),
684	rdf_insert_literal_map(Map, SoundEx, Token).
685
686
687		 /*******************************
688		 *	       UTIL		*
689		 *******************************/
690
691verbose(Fmt, Args) :-
692	setting(verbose(true)), !,
693	format(user_error, Fmt, Args).
694verbose(_, _).
695
696progress(Map, Which) :-
697	setting(verbose(true)), !,
698	rdf_statistics_literal_map(Map, size(Keys, Values)),
699	format(user_error,
700	       '\r~t~w: ~12|Keys: ~t~D~15+; Values: ~t~D~20+',
701	       [Which, Keys, Values]).
702progress(_,_).
703