1/*  $Id$
2
3    Part of SWI-Prolog
4
5    Author:        Jan Wielemaker
6    E-mail:        J.Wielemaker@uva.nl
7    WWW:           http://www.swi-prolog.org
8    Copyright (C): 2008, University of Amsterdam
9
10    This program is free software; you can redistribute it and/or
11    modify it under the terms of the GNU General Public License
12    as published by the Free Software Foundation; either version 2
13    of the License, or (at your option) any later version.
14
15    This program is distributed in the hope that it will be useful,
16    but WITHOUT ANY WARRANTY; without even the implied warranty of
17    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18    GNU General Public License for more details.
19
20    You should have received a copy of the GNU Lesser General Public
21    License along with this library; if not, write to the Free Software
22    Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
23
24    As a special exception, if you link this library with other files,
25    compiled with a Free Software compiler, to produce an executable, this
26    library does not by itself cause the resulting executable to be covered
27    by the GNU General Public License. This exception does not however
28    invalidate any other reasons why the executable file might be covered by
29    the GNU General Public License.
30*/
31
32
33:- module(http_open,
34	  [ http_open/3,		% +URL, -Stream, +Options
35	    http_set_authorization/2	% +URL, +Authorization
36	  ]).
37
38:- use_module(library(url)).
39:- use_module(library(readutil)).
40:- use_module(library(socket)).
41:- use_module(library(lists)).
42:- use_module(library(option)).
43:- use_module(library(error)).
44:- use_module(library(base64)).
45:- use_module(library(debug)).
46
47:- expects_dialect(swi).
48:- assert(system:swi_io).
49
50user_agent('SWI-Prolog <http://www.swi-prolog.org>').
51
52/** <module> Simple HTTP client
53
54This library provides a light-weight HTTP client library to get the data
55from a URL. The functionality of the  library can be extended by loading
56two additional modules that acts as plugins:
57
58    * library(http/http_chunked)
59    Loading this library causes http_open/3 to support chunked
60    transfer encoding.
61
62    * library(http/http_header)
63    Loading this library causes http_open/3 to support the =POST= method
64    in addition to =GET= and =HEAD=.
65
66Here is a simple example to fetch a web-page:
67
68  ==
69  ?- http_open('http://www.google.com/search?q=prolog', In, []),
70     copy_stream_data(In, user_output),
71     close(In).
72  <!doctype html><head><title>prolog - Google Search</title><script>
73  ...
74  ==
75
76The example below fetches the modification time of a web-page. Note that
77Modified is '' if the web-server does not provide a time-stamp for the
78resource. See also parse_time/2.
79
80  ==
81  modified(URL, Stamp) :-
82	  http_open(URL, In,
83		    [ method(head),
84		      header(last_modified, Modified)
85		    ]),
86	  close(In),
87	  Modified \== '',
88	  parse_time(Modified, Stamp).
89  close(In).
90  ==
91
92@see xpath/3
93@see http_get/3
94@see http_post/4
95*/
96
97:- multifile
98	http:encoding_filter/3,		  % +Encoding, +In0,  -In
99	http:current_transfer_encoding/1, % ?Encoding
100	http:http_protocol_hook/7.	  % +Protocol, +Parts, +In, +Out,
101					  % -NewIn, -NewOut, +Options
102
103
104%%	http_open(+URL, -Stream, +Options) is det.
105%
106%	Open the data at the HTTP  server   as  a  Prolog stream. URL is
107%	either an atom  specifying  a  URL   or  a  list  representing a
108%	broken-down URL compatible to parse_url/2.  After this predicate
109%	succeeds the data can be read from Stream. After completion this
110%	stream must be  closed  using   the  built-in  Prolog  predicate
111%	close/1. Options provides additional options:
112%
113%	  * authorization(+Term)
114%	  Send authorization.  Currently only supports basic(User,Password).
115%	  See also http_set_authorization/2.
116%
117%	  * final_url(-FinalURL)
118%	  Unify FinalURL} with the final  destination. This differs from
119%	  the  original  URL  if  the  returned  head  of  the  original
120%	  indicates an HTTP redirect (codes 301,  302 or 303). Without a
121%	  redirect, FinalURL is unified with   the  canonical version of
122%	  URL using:
123%
124%	      ==
125%	      parse_url(URL, Parts),
126%	      parse_url(FinalURL, Parts)
127%	      ==
128%
129%	  * header(Name, -AtomValue)
130%	  If provided, AtomValue is  unified  with   the  value  of  the
131%	  indicated  field  in  the  reply    header.  Name  is  matched
132%	  case-insensitive and the underscore  (_)   matches  the hyphen
133%	  (-). Multiple of these options  may   be  provided  to extract
134%	  multiple  header  fields.  If  the  header  is  not  available
135%	  AtomValue is unified to the empty atom ('').
136%
137%	  * method(+Method)
138%	  One of =get= (default) or =head=.   The  =head= message can be
139%	  used in combination with  the   header(Name,  Value) option to
140%	  access information on the resource   without actually fetching
141%	  the resource itself.  The  returned   stream  must  be  closed
142%	  immediately.   If   library(http/http_header)     is   loaded,
143%	  http_open/3 also supports =post=. See the post(Data) option.
144%
145%	  * size(-Size)
146%	  Size is unified with the   integer value of =|Content-Length|=
147%	  in the reply header.
148%
149%	  * timeout(+Timeout)
150%	  If provided, set a timeout on   the stream using set_stream/2.
151%	  With this option if no new data arrives within Timeout seconds
152%	  the stream raises an exception.  Default   is  to wait forever
153%	  (=infinite=).
154%
155%	  * post(+Data)
156%	  Provided if library(http/http_header) is also loaded.  Data is
157%	  handed to http_post_data/3.
158%
159%	  * proxy(+Host, +Port)
160%	  Use an HTTP proxy to connect to the outside world.
161%
162%	  * proxy_authorization(+Authorization)
163%	  Send authorization to the proxy.  Otherwise   the  same as the
164%	  =authorization= option.
165%
166%	  * request_header(Name = Value)
167%	  Additional  name-value  parts  are  added   in  the  order  of
168%	  appearance to the HTTP request   header.  No interpretation is
169%	  done.
170%
171%	  * user_agent(+Agent)
172%	  Defines the value of the  =|User-Agent|=   field  of  the HTTP
173%	  header. Default is =|SWI-Prolog (http://www.swi-prolog.org)|=.
174%
175%	@error	existence_error(url, Id)
176
177http_open(URL, Stream, Options) :-
178	atom(URL), !,
179	parse_url_ex(URL, Parts),
180	add_authorization(URL, Options, Options1),
181	http_open(Parts, Stream, Options1).
182http_open(Parts, Stream, Options0) :-
183	memberchk(proxy(Host, ProxyPort), Options0), !,
184	parse_url_ex(Location, Parts),
185	Options = [visited(Parts)|Options0],
186	open_socket(Host:ProxyPort, In, Out, Options),
187        option(protocol(Protocol), Parts, http),
188	default_port(Protocol, DefPort),
189	option(port(Port), Parts, DefPort),
190	host_and_port(Host, DefPort, Port, HostPort),
191	add_authorization(Parts, Options, Options1),
192	send_rec_header(Out, In, Stream, HostPort, Location, Parts, Options1),
193	return_final_url(Options).
194http_open(Parts, Stream, Options0) :-
195	memberchk(host(Host), Parts),
196        option(protocol(Protocol), Parts, http),
197	default_port(Protocol, DefPort),
198	option(port(Port), Parts, DefPort),
199	http_location(Parts, Location),
200	Options = [visited(Parts)|Options0],
201	open_socket(Host:Port, SocketIn, SocketOut, Options),
202        (   http:http_protocol_hook(Protocol, Parts,
203				    SocketIn, SocketOut,
204				    In, Out, Options)
205        ->  true
206        ;   In = SocketIn,
207            Out = SocketOut
208        ),
209	host_and_port(Host, DefPort, Port, HostPort),
210	add_authorization(Parts, Options, Options1),
211	send_rec_header(Out, In, Stream, HostPort, Location, Parts, Options1),
212	return_final_url(Options).
213
214http:http_protocol_hook(http, _, In, Out, In, Out, _).
215
216default_port(https, 443) :- !.
217default_port(_,	    80).
218
219host_and_port(Host, DefPort, DefPort, Host) :- !.
220host_and_port(Host, _,       Port,    Host:Port).
221
222%%	send_rec_header(+Out, +In, -InStream,
223%%			+Host, +Location, +Parts, +Options) is det.
224%
225%	Send header to Out and process reply.  If there is an error or
226%	failure, close In and Out and return the error or failure.
227
228send_rec_header(Out, In, Stream, Host, Location, Parts, Options) :-
229	(   catch(guarded_send_rec_header(Out, In, Stream,
230					  Host, Location, Parts, Options),
231		  E, true)
232	->  (   var(E)
233	    ->	close(Out)
234	    ;	force_close(In, Out),
235		throw(E)
236	    )
237	;   force_close(In, Out),
238	    fail
239	).
240
241guarded_send_rec_header(Out, In, Stream, Host, Location, Parts, Options) :-
242	user_agent(Agent, Options),
243	method(Options, MNAME),
244	http_version(Version),
245	format(Out,
246	       '~w ~w HTTP/~w\r\n\
247	       Host: ~w\r\n\
248	       User-Agent: ~w\r\n\
249	       Connection: close\r\n',
250	       [MNAME, Location, Version, Host, Agent]),
251	x_headers(Options, Out),
252        (   option(post(PostData), Options)
253        ->  http_header:http_post_data(PostData, Out, [])
254        ;   format(Out, '\r\n', [])
255        ),
256	flush_output(Out),
257					% read the reply header
258	read_header(In, Code, Comment, Lines),
259	do_open(Code, Comment, Lines, Options, Parts, In, Stream).
260
261
262%%	http_version(-Version:atom) is det.
263%
264%	HTTP version we publish. We  can  only   use  1.1  if we support
265%	chunked encoding, which means http_chunked.pl must be loaded.
266
267http_version('1.1') :-
268	http:current_transfer_encoding(chunked), !.
269http_version('1.0').
270
271force_close(S1, S2) :-
272	close(S1, [force(true)]),
273	close(S2, [force(true)]).
274
275method(Options, MNAME) :-
276	option(post(_), Options), !,
277	option(method(M), Options, post),
278	(   map_method(M, MNAME0)
279	->  MNAME = MNAME0
280	;   domain_error(method, M)
281	).
282method(Options, MNAME) :-
283	option(method(M), Options, get),
284	(   map_method(M, MNAME0)
285	->  MNAME = MNAME0
286	;   domain_error(method, M)
287	).
288
289map_method(get,  'GET').
290map_method(head, 'HEAD').
291map_method(post, 'POST') :-
292	current_predicate(http_header:http_post_data/3).
293
294
295%%	x_headers(+Options, +Out) is det.
296%
297%	Emit extra headers from   request_header(Name=Value)  options in
298%	Options.
299
300x_headers([], _).
301x_headers([H|T], Out) :- !,
302	x_header(H, Out),
303	x_headers(T, Out).
304
305x_header(request_header(Name=Value), Out) :- !,
306	format(Out, '~w: ~w\r\n', [Name, Value]).
307x_header(proxy_authorization(ProxyAuthorization), Out) :- !,
308	auth_header(ProxyAuthorization, 'Proxy-Authorization', Out).
309x_header(authorization(Authorization), Out) :- !,
310	auth_header(Authorization, 'Authorization', Out).
311x_header(_, _).
312
313auth_header(basic(User, Password), Header, Out) :- !,
314	format(codes(Codes), '~w:~w', [User, Password]),
315	phrase(base64(Codes), Base64Codes),
316	format(Out, '~w: basic ~s\r\n', [Header, Base64Codes]).
317auth_header(Auth, _, _) :-
318	domain_error(authorization, Auth).
319
320user_agent(Agent, Options) :-
321	(   option(user_agent(Agent), Options)
322	->  true
323	;   user_agent(Agent)
324	).
325
326%%	do_open(+HTTPStatusCode, +HTTPStatusComment, +Header,
327%%		+Options, +Parts, +In, -FinalIn) is det.
328%
329%	Handle the HTTP status. If 200, we   are ok. If a redirect, redo
330%	the open, returning a new stream. Else issue an error.
331%
332%	@error	existence_error(url, URL)
333
334do_open(200, _, Lines, Options, Parts, In0, In) :- !,
335	return_size(Options, Lines),
336	return_fields(Options, Lines),
337	transfer_encoding_filter(Lines, In0, In),
338					% properly re-initialise the stream
339	parse_url_ex(Id, Parts),
340	set_stream(In, file_name(Id)),
341	set_stream(In, record_position(true)).
342					% Handle redirections
343do_open(Code, _, Lines, Options, Parts, In, Stream) :-
344	redirect_code(Code),
345	location(Lines, Location), !,
346	debug(http(redirect), 'http_open: redirecting to ~w', [Location]),
347	parse_url_ex(Location, Parts, Redirected),
348	close(In),
349	http_open(Redirected, Stream, [visited(Redirected)|Options]).
350					% report anything else as error
351do_open(Code, Comment, _,  _, Parts, _, _) :-
352	parse_url_ex(Id, Parts),
353	(   map_error_code(Code, Error)
354	->  Formal =.. [Error, url, Id]
355	;   Formal = existence_error(url, Id)
356	),
357	throw(error(Formal, context(_, status(Code, Comment)))).
358
359%%	map_error_code(+HTTPCode, -PrologError) is semidet.
360%
361%	Map HTTP error codes to Prolog errors.
362%
363%	@tbd	Many more maps. Unfortunately many have no sensible Prolog
364%		counterpart.
365
366map_error_code(401, permission_error).
367map_error_code(403, permission_error).
368map_error_code(404, existence_error).
369map_error_code(405, permission_error).
370map_error_code(407, permission_error).
371map_error_code(410, existence_error).
372
373redirect_code(301).			% moved permanently
374redirect_code(302).			% moved temporary
375redirect_code(303).			% see also
376
377%%	open_socket(+Address, -In, -Out, +Options) is det.
378%
379%	Create and connect a client socket to Address.  Options
380%
381%	    * timeout(+Timeout)
382%	    Sets timeout on the stream, *after* connecting the
383%	    socket.
384%
385%	@tbd	Make timeout also work on tcp_connect/4.
386%	@tbd	This is the same as do_connect/4 in http_client.pl
387
388open_socket(Address, In, Out, Options) :-
389	debug(http(open), 'http_open: Connecting to ~p ...', [Address]),
390	tcp_socket(Socket),
391	catch(tcp_connect(Socket, Address, In, Out),
392	      E,
393	      (	  tcp_close_socket(Socket),
394		  throw(E)
395	      )),
396	debug(http(open), '\tok ~p --> ~p', [In, Out]),
397	set_stream(In, record_position(false)),
398	(   memberchk(Options, timeout(Timeout))
399	->  set_stream(In, timeout(Timeout))
400	;   true
401	).
402
403
404return_size(Options, Lines) :-
405	memberchk(size(Size), Options), !,
406	content_length(Lines, Size).
407return_size(_, _).
408
409return_fields([], _).
410return_fields([header(Name, Value)|T], Lines) :- !,
411	atom_codes(Name, Codes),
412	(   member(Line, Lines),
413	    phrase(atom_field(Codes, Value), Line)
414	->  true
415	;   Value = ''
416	),
417	return_fields(T, Lines).
418return_fields([_|T], Lines) :-
419	return_fields(T, Lines).
420
421
422%%	return_final_url(+Options) is semidet.
423%
424%	If Options contains final_url(URL), unify URL with the final
425%	URL after redirections.
426
427return_final_url(Options) :-
428	memberchk(final_url(URL), Options),
429	var(URL), !,
430	memberchk(visited(Parts), Options),
431	parse_url_ex(URL, Parts).
432return_final_url(_).
433
434
435%%	transfer_encoding_filter(+Lines, +In0, -In) is det.
436%
437%	Install filters depending on the encoding.
438
439transfer_encoding_filter(Lines, In0, In) :-
440	transfer_encoding(Lines, Encoding), !,
441	(   http:encoding_filter(Encoding, In0, In)
442	->  true
443	;   	domain_error(http_encoding, Encoding)
444	).
445transfer_encoding_filter(_, In, In).
446
447
448%%	transfer_encoding(+Lines, -Encoding) is semidet.
449%
450%	True if Encoding is the value of the =|Transfer-encoding|=
451%	header.
452
453transfer_encoding(Lines, Encoding) :-
454	member(Line, Lines),
455	phrase(transfer_encoding(Encoding0), Line), !,
456	debug(http(transfer_encoding), 'Transfer-encoding: ~w', [Encoding0]),
457	Encoding = Encoding0.
458
459transfer_encoding(Encoding) -->
460	field("transfer-encoding"),
461	rest(Encoding).
462
463%%	read_header(+In:stream, -Code:int, -Comment:atom, -Lines:list)
464%
465%	Read the HTTP reply-header.
466%
467%	@param Code	Numeric HTTP reply-code
468%	@param Comment	Comment of reply-code as atom
469%	@param Lines	Remaining header lines as code-lists.
470
471read_header(In, Code, Comment, Lines) :-
472	read_line_to_codes(In, Line),
473	phrase(first_line(Code, Comment), Line),
474	read_line_to_codes(In, Line2),
475	rest_header(Line2, In, Lines).
476
477rest_header("", _, []) :- !.		% blank line: end of header
478rest_header(L0, In, [L0|L]) :-
479	read_line_to_codes(In, L1),
480	rest_header(L1, In, L).
481
482%%	content_length(+Header, -Length:int) is semidet.
483%
484%	Find the Content-Length in an HTTP reply-header.
485
486content_length(Lines, Length) :-
487	member(Line, Lines),
488	phrase(content_length(Length0), Line), !,
489	Length = Length0.
490
491location(Lines, Location) :-
492	member(Line, Lines),
493	phrase(atom_field("location", Location), Line), !.
494
495first_line(Code, Comment) -->
496	"HTTP/", [_], ".", [_],
497	skip_blanks,
498	integer(Code),
499	skip_blanks,
500	rest(Comment).
501
502atom_field(Name, Value) -->
503	field(Name),
504	rest(Value).
505
506content_length(Len) -->
507	field("content-length"),
508	integer(Len).
509
510field([]) -->
511	":",
512	skip_blanks.
513field([H|T]) -->
514	[C],
515	{ match_header_char(H, C)
516	},
517	field(T).
518
519match_header_char(C, C) :- !.
520match_header_char(C, U) :-
521	code_type(C, to_lower(U)), !.
522match_header_char(0'_, 0'-).
523
524
525skip_blanks -->
526	[C],
527	{ code_type(C, white)
528	}, !,
529	skip_blanks.
530skip_blanks -->
531	[].
532
533%%	integer(-Int)//
534%
535%	Read 1 or more digits and return as integer.
536
537integer(Code) -->
538	digit(D0),
539	digits(D),
540	{ number_codes(Code, [D0|D])
541	}.
542
543digit(C) -->
544	[C],
545	{ code_type(C, digit)
546	}.
547
548digits([D0|D]) -->
549	digit(D0), !,
550	digits(D).
551digits([]) -->
552	[].
553
554%%	rest(-Atom:atom)//
555%
556%	Get rest of input as an atom.
557
558rest(A,L,[]) :-
559	atom_codes(A, L).
560
561
562		 /*******************************
563		 *   AUTHORIZATION MANAGEMENT	*
564		 *******************************/
565
566%%	http_set_authorization(+URL, +Authorization) is det.
567%
568%	Set user/password to supply with URLs   that have URL as prefix.
569%	If  Authorization  is  the   atom    =|-|=,   possibly   defined
570%	authorization is cleared.  For example:
571%
572%	==
573%	?- http_set_authorization('http://www.example.com/private/',
574%				  basic('John', 'Secret'))
575%	==
576%
577%	@tbd	Move to a separate module, so http_get/3, etc. can use this
578%		too.
579
580:- dynamic
581	stored_authorization/2,
582	cached_authorization/2.
583
584http_set_authorization(URL, Authorization) :-
585	must_be(atom, URL),
586	retractall(stored_authorization(URL, _)),
587	(   Authorization = (-)
588	->  true
589	;   check_authorization(Authorization),
590	    assert(stored_authorization(URL, Authorization))
591	),
592	retractall(cached_authorization(_,_)).
593
594check_authorization(Var) :-
595	var(Var), !,
596	instantiation_error(Var).
597check_authorization(basic(User, Password)) :-
598	must_be(atom, User),
599	must_be(atom, Password).
600
601%%	authorization(+URL, -Authorization) is semdet.
602%
603%	True if Authorization must be supplied for URL.
604%
605%	@tbd	Cleanup cache if it gets too big.
606
607authorization(_, _) :-
608	\+ stored_authorization(_, _), !,
609	fail.
610authorization(URL, Authorization) :-
611	cached_authorization(URL, Authorization), !,
612	Authorization \== (-).
613authorization(URL, Authorization) :-
614	(   stored_authorization(Prefix, Authorization),
615	    sub_atom(URL, 0, _, _, Prefix)
616	->  assert(cached_authorization(URL, Authorization))
617	;   assert(cached_authorization(URL, -)),
618	    fail
619	).
620
621add_authorization(_, Options, Options) :-
622	option(authorization(_), Options), !.
623add_authorization(For, Options0, Options) :-
624	stored_authorization(_, _) ->	% quick test to avoid work
625	(   atom(For)
626	->  URL = For
627	;   parse_url_ex(URL, For)
628	),
629	authorization(URL, Auth), !,
630	Options = [authorization(Auth)|Options0].
631add_authorization(_, Options, Options).
632
633
634parse_url_ex(URL, Parts) :-
635	parse_url(URL, Parts), !.
636parse_url_ex(URL, _) :-
637	domain_error(url, URL).		% Syntax error?
638
639parse_url_ex(URL, RelativeTo, Parts) :-
640	parse_url(URL, RelativeTo, Parts), !.
641parse_url_ex(URL, _, _) :-
642	domain_error(url, URL).		% Syntax error?
643
644:- retract(system:swi_io).
645