1 /* -*- Mode: c; c-basic-offset: 2 -*-
2  *
3  * raptor_www.c - Raptor WWW retrieval core
4  *
5  * Copyright (C) 2003-2008, David Beckett http://www.dajobe.org/
6  * Copyright (C) 2003-2005, University of Bristol, UK http://www.bristol.ac.uk/
7  *
8  * This package is Free Software and part of Redland http://librdf.org/
9  *
10  * It is licensed under the following three licenses as alternatives:
11  *   1. GNU Lesser General Public License (LGPL) V2.1 or any newer version
12  *   2. GNU General Public License (GPL) V2 or any newer version
13  *   3. Apache License, V2.0 or any newer version
14  *
15  * You may not use this file except in compliance with at least one of
16  * the above three licenses.
17  *
18  * See LICENSE.html or LICENSE.txt at the top of this package for the
19  * complete terms and further detail along with the license texts for
20  * the licenses in COPYING.LIB, COPYING and LICENSE-2.0.txt respectively.
21  *
22  *
23  */
24 
25 
26 #ifdef HAVE_CONFIG_H
27 #include <raptor_config.h>
28 #endif
29 
30 #ifdef WIN32
31 #include <win32_raptor_config.h>
32 #endif
33 
34 #include <stdio.h>
35 #include <string.h>
36 #include <stdarg.h>
37 #ifdef HAVE_ERRNO_H
38 #include <errno.h>
39 #endif
40 #ifdef HAVE_SYS_STAT_H
41 #include <sys/stat.h>
42 #endif
43 
44 /* Raptor includes */
45 #include "raptor.h"
46 #include "raptor_internal.h"
47 
48 
49 static int raptor_www_init_common(int skip_www_init_finish, int *www_initialized);
50 static void raptor_www_finish_common(int skip_www_init_finish);
51 static int raptor_www_file_fetch(raptor_www* www);
52 
53 
54 
55 #ifndef RAPTOR_DISABLE_V1
/* Legacy (V1) global state.
 *
 * raptor_www_skip_www_init_finish: non-0 when raptor_www must NOT do the
 *   initialising and cleanup of the lower level WWW library.
 * raptor_www_initialized: non-0 once the legacy raptor_www_init() has run.
 */
static int raptor_www_initialized = 0;
static int raptor_www_skip_www_init_finish = 0;
59 #endif
60 
61 
62 #ifndef RAPTOR_DISABLE_V1
63 /**
64  * raptor_www_init:
65  *
66  * Initialise the WWW class.
67  *
68  * Must be called before creating any #raptor_www object.
69  *
70  * See also: raptor_www_init_v2()
71  **/
72 void
raptor_www_init(void)73 raptor_www_init(void)
74 {
75   raptor_www_init_common(raptor_www_skip_www_init_finish, &raptor_www_initialized);
76 }
77 #endif
78 
79 
80 /**
81  * raptor_www_init_v2:
82  * @world: raptor_world object
83  *
84  * Initialise the WWW class.
85  *
86  * Must be called before creating any #raptor_www object.
87  *
88  * See also: raptor_www_init()
89  *
90  * Return value: non-0 on failure
91  **/
92 int
raptor_www_init_v2(raptor_world * world)93 raptor_www_init_v2(raptor_world* world)
94 {
95 #ifndef RAPTOR_DISABLE_V1
96   /* support legacy v1 raptor_www_no_www_library_init_finish() */
97   if(raptor_www_skip_www_init_finish)
98     world->www_skip_www_init_finish = raptor_www_skip_www_init_finish;
99 
100   /* skip init if already inited with legacy init() */
101   if(raptor_www_initialized)
102     return 0;
103 #endif
104 
105   return raptor_www_init_common(world->www_skip_www_init_finish, &world->www_initialized);
106 }
107 
108 
/*
 * raptor_www_init_common:
 * @skip_www_init_finish: non-0 to leave the lower level WWW library alone
 * @www_initialized: in/out flag recording that initialisation was done
 *
 * Shared initialisation for the V1 and V2 entry points.
 *
 * Does nothing if *@www_initialized is already set.  Otherwise, when
 * built with libcurl and not asked to skip, calls curl_global_init().
 * The flag is set to 1 whatever the result of that call.
 *
 * Return value: 0, or the curl_global_init() result when it was called
 */
static int
raptor_www_init_common(int skip_www_init_finish, int *www_initialized)
{
  int result = 0;

  if(!*www_initialized) {
#ifdef RAPTOR_WWW_LIBCURL
    if(!skip_www_init_finish)
      result = curl_global_init(CURL_GLOBAL_ALL);
#else
    (void)skip_www_init_finish; /* nothing to initialise in this build */
#endif

    *www_initialized = 1;
  }

  return result;
}
126 
127 
128 #ifndef RAPTOR_DISABLE_V1
129 /**
130  * raptor_www_no_www_library_init_finish:
131  *
132  * Do not initialise or finish the lower level WWW library.
133  *
134  * If this is called then the raptor_www library will neither
135  * initialise or terminate the lower level WWW library.  Usually in
136  * raptor_init either curl_global_init (for libcurl)
137  * are called and in raptor_finish curl_global_cleanup is called.
138  *
139  * This allows the application finer control over these libraries such
140  * as setting other global options or potentially calling and terminating
141  * raptor several times.  It does mean that applications which use
142  * this call must do their own extra work in order to allocate and free
143  * all resources to the system.
144  *
145  * This function must be called before raptor_init.
146  *
147  * See also: raptor_www_no_www_library_init_finish_v2()
148  *
149  **/
150 void
raptor_www_no_www_library_init_finish(void)151 raptor_www_no_www_library_init_finish(void)
152 {
153   raptor_www_skip_www_init_finish = 1;
154 }
155 #endif
156 
157 
158 /**
159  * raptor_www_no_www_library_init_finish_v2:
160  * @world: raptor_world object
161  *
162  * Do not initialise or finish the lower level WWW library.
163  *
164  * If this is called then the raptor_www library will neither
165  * initialise or terminate the lower level WWW library.  Usually in
166  * raptor_world_open() either curl_global_init (for libcurl)
167  * are called and in raptor_finish curl_global_cleanup is called.
168  *
169  * This allows the application finer control over these libraries such
170  * as setting other global options or potentially calling and terminating
171  * raptor several times.  It does mean that applications which use
172  * this call must do their own extra work in order to allocate and free
173  * all resources to the system.
174  *
175  * This function must be called before raptor_world_open().
176  *
177  **/
178 void
raptor_www_no_www_library_init_finish_v2(raptor_world * world)179 raptor_www_no_www_library_init_finish_v2(raptor_world* world)
180 {
181   world->www_skip_www_init_finish = 1;
182 }
183 
184 
185 #ifndef RAPTOR_DISABLE_V1
186 /**
187  * raptor_www_finish:
188  *
189  * Terminate the WWW class.
190  *
191  * Must be called to clean any resources used by the WWW implementation.
192  *
193  * See also: raptor_www_finish_v2()
194  **/
195 void
raptor_www_finish(void)196 raptor_www_finish(void)
197 {
198   raptor_www_finish_common(raptor_www_skip_www_init_finish);
199 }
200 #endif
201 
202 
203 /**
204  * raptor_www_finish_v2:
205  * @world: raptor_world object
206  *
207  * Terminate the WWW class.
208  *
209  * Must be called to clean any resources used by the WWW implementation.
210  *
211  * See also: raptor_www_finish()
212  **/
213 void
raptor_www_finish_v2(raptor_world * world)214 raptor_www_finish_v2(raptor_world* world)
215 {
216   raptor_www_finish_common(world->www_skip_www_init_finish);
217 }
218 
219 
/*
 * raptor_www_finish_common:
 * @skip_www_init_finish: non-0 to leave the lower level WWW library alone
 *
 * Shared cleanup for the V1 and V2 entry points: when built with
 * libcurl and not asked to skip, calls curl_global_cleanup().
 */
static void
raptor_www_finish_common(int skip_www_init_finish)
{
  if(skip_www_init_finish)
    return;

#ifdef RAPTOR_WWW_LIBCURL
  curl_global_cleanup();
#endif
}
229 
230 
231 #ifndef RAPTOR_DISABLE_V1
232 /**
233  * raptor_www_new_with_connection:
234  * @connection: external WWW connection object.
235  *
236  * Constructor - create a new #raptor_www object over an existing WWW connection.
237  *
238  * At present this only works with a libcurl CURL handle object
239  * when raptor is compiled with libcurl suppport. Otherwise the
240  * @connection is ignored.  This allows such things as setting
241  * up special flags on the curl handle before passing into the constructor.
242  *
243  * raptor_init() MUST have been called before calling this function.
244  * Use raptor_www_new_with_connection_v2() if using raptor_world APIs.
245  *
246  * Return value: a new #raptor_www object or NULL on failure.
247  **/
248 raptor_www*
raptor_www_new_with_connection(void * connection)249 raptor_www_new_with_connection(void *connection)
250 {
251   return raptor_www_new_with_connection_v2(raptor_world_instance(), connection);
252 }
253 #endif
254 
255 
256 /**
257  * raptor_www_new_with_connection_v2:
258  * @world: raptor_world object
259  * @connection: external WWW connection object.
260  *
261  * Constructor - create a new #raptor_www object over an existing WWW connection.
262  *
263  * At present this only works with a libcurl CURL handle object
264  * when raptor is compiled with libcurl suppport. Otherwise the
265  * @connection is ignored.  This allows such things as setting
266  * up special flags on the curl handle before passing into the constructor.
267  *
268  * Return value: a new #raptor_www object or NULL on failure.
269  **/
270 raptor_www*
raptor_www_new_with_connection_v2(raptor_world * world,void * connection)271 raptor_www_new_with_connection_v2(raptor_world* world, void *connection)
272 {
273   raptor_www* www=(raptor_www* )RAPTOR_CALLOC(www, 1, sizeof(raptor_www));
274   if(!www)
275     return NULL;
276 
277   www->world=world;
278   www->type=NULL;
279   www->free_type=1; /* default is to free content type */
280   www->total_bytes=0;
281   www->failed=0;
282   www->status_code=0;
283   www->write_bytes=NULL;
284   www->content_type=NULL;
285   www->uri_filter=NULL;
286   www->connection_timeout=10;
287   www->cache_control=NULL;
288 
289 #ifdef RAPTOR_WWW_LIBCURL
290   www->curl_handle=(CURL*)connection;
291   raptor_www_curl_init(www);
292 #endif
293 #ifdef RAPTOR_WWW_LIBXML
294   raptor_www_libxml_init(www);
295 #endif
296 #ifdef RAPTOR_WWW_LIBFETCH
297   raptor_www_libfetch_init(www);
298 #endif
299 
300   www->error_handlers.locator=&www->locator;
301   raptor_error_handlers_init_v2(world, &www->error_handlers);
302 
303   return www;
304 }
305 
306 
307 #ifndef RAPTOR_DISABLE_V1
308 /**
309  * raptor_www_new:
310  *
311  * Constructor - create a new #raptor_www object.
312  *
313  * raptor_init() MUST have been called before calling this function.
314  * Use raptor_www_new_v2() if using raptor_world APIs.
315  *
316  * Return value: a new #raptor_www or NULL on failure.
317  **/
318 raptor_www*
raptor_www_new(void)319 raptor_www_new(void)
320 {
321   return raptor_www_new_v2(raptor_world_instance());
322 }
323 #endif
324 
325 
326 /**
327  * raptor_www_new_v2:
328  * @world: raptor_world object
329  *
330  * Constructor - create a new #raptor_www object.
331  *
332  * Return value: a new #raptor_www or NULL on failure.
333  **/
334 raptor_www*
raptor_www_new_v2(raptor_world * world)335 raptor_www_new_v2(raptor_world* world)
336 {
337   return raptor_www_new_with_connection_v2(world, NULL);
338 }
339 
340 
341 /**
342  * raptor_www_free:
343  * @www: WWW object.
344  *
345  * Destructor - destroy a #raptor_www object.
346  **/
347 void
raptor_www_free(raptor_www * www)348 raptor_www_free(raptor_www* www)
349 {
350   /* free context */
351   if(www->type) {
352     if(www->free_type)
353       RAPTOR_FREE(cstring, www->type);
354     www->type=NULL;
355   }
356 
357   if(www->user_agent) {
358     RAPTOR_FREE(cstring, www->user_agent);
359     www->user_agent=NULL;
360   }
361 
362   if(www->cache_control) {
363     RAPTOR_FREE(cstring, www->cache_control);
364     www->cache_control=NULL;
365   }
366 
367   if(www->proxy) {
368     RAPTOR_FREE(cstring, www->proxy);
369     www->proxy=NULL;
370   }
371 
372   if(www->http_accept) {
373     RAPTOR_FREE(cstring, www->http_accept);
374     www->http_accept=NULL;
375   }
376 
377 #ifdef RAPTOR_WWW_LIBCURL
378   raptor_www_curl_free(www);
379 #endif
380 #ifdef RAPTOR_WWW_LIBXML
381   raptor_www_libxml_free(www);
382 #endif
383 #ifdef RAPTOR_WWW_LIBFETCH
384   raptor_www_libfetch_free(www);
385 #endif
386 
387   if(www->uri)
388     raptor_free_uri_v2(www->world, www->uri);
389 
390   if(www->final_uri)
391     raptor_free_uri_v2(www->world, www->final_uri);
392 
393   RAPTOR_FREE(www, www);
394 }
395 
396 
397 
398 /**
399  * raptor_www_set_error_handler:
400  * @www: WWW object
401  * @error_handler: error handler function
402  * @error_data: error handler data
403  *
404  * Set the error handler routine for the raptor_www class.
405  *
406  * This takes the same arguments as the raptor_parser_set_error() and
407  * raptor_parser_set_warning_handler() methods.
408  **/
409 void
raptor_www_set_error_handler(raptor_www * www,raptor_message_handler error_handler,void * error_data)410 raptor_www_set_error_handler(raptor_www* www,
411                              raptor_message_handler error_handler,
412                              void *error_data)
413 {
414   www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].user_data=error_data;
415   www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].handler=error_handler;
416 }
417 
418 
419 /**
420  * raptor_www_set_write_bytes_handler:
421  * @www: WWW object
422  * @handler: bytes handler function
423  * @user_data: bytes handler data
424  *
425  * Set the handler to receive bytes written by the #raptor_www implementation.
426  *
427  **/
428 void
raptor_www_set_write_bytes_handler(raptor_www * www,raptor_www_write_bytes_handler handler,void * user_data)429 raptor_www_set_write_bytes_handler(raptor_www* www,
430                                    raptor_www_write_bytes_handler handler,
431                                    void *user_data)
432 {
433   www->write_bytes=handler;
434   www->write_bytes_userdata=user_data;
435 }
436 
437 
438 /**
439  * raptor_www_set_content_type_handler:
440  * @www: WWW object
441  * @handler: content type handler function
442  * @user_data: content type handler data
443  *
444  * Set the handler to receive the HTTP Content-Type header value.
445  *
446  * This is called if or when the value is discovered during retrieval
447  * by the raptor_www implementation.  Not all implementations provide
448  * access to this.
449  **/
450 void
raptor_www_set_content_type_handler(raptor_www * www,raptor_www_content_type_handler handler,void * user_data)451 raptor_www_set_content_type_handler(raptor_www* www,
452                                     raptor_www_content_type_handler handler,
453                                     void *user_data)
454 {
455   www->content_type=handler;
456   www->content_type_userdata=user_data;
457 }
458 
459 
460 /**
461  * raptor_www_set_user_agent:
462  * @www: WWW object
463  * @user_agent: User-Agent string
464  *
465  * Set the user agent value, for HTTP requests typically.
466  **/
467 void
raptor_www_set_user_agent(raptor_www * www,const char * user_agent)468 raptor_www_set_user_agent(raptor_www* www, const char *user_agent)
469 {
470   char *ua_copy=NULL;
471 
472   if(!user_agent || !*user_agent) {
473     www->user_agent=NULL;
474     return;
475   }
476 
477   ua_copy=(char*)RAPTOR_MALLOC(cstring, strlen(user_agent)+1);
478   if(!ua_copy)
479     return;
480   strcpy(ua_copy, user_agent);
481 
482   www->user_agent=ua_copy;
483 }
484 
485 
486 /**
487  * raptor_www_set_proxy:
488  * @www: WWW object
489  * @proxy: proxy string.
490  *
491  * Set the proxy for the WWW object.
492  *
493  * The @proxy usually a string of the form http://server.domain:port.
494  **/
495 void
raptor_www_set_proxy(raptor_www * www,const char * proxy)496 raptor_www_set_proxy(raptor_www* www, const char *proxy)
497 {
498   char *proxy_copy;
499 
500   if(!proxy)
501     return;
502 
503   proxy_copy=(char*)RAPTOR_MALLOC(cstring, strlen(proxy)+1);
504   if(!proxy_copy)
505     return;
506   strcpy(proxy_copy, proxy);
507 
508   www->proxy=proxy_copy;
509 }
510 
511 
512 /**
513  * raptor_www_set_http_accept:
514  * @www: #raptor_www class
515  * @value: Accept: header value or NULL to have an empty one.
516  *
517  * Set HTTP Accept header.
518  *
519  **/
520 void
raptor_www_set_http_accept(raptor_www * www,const char * value)521 raptor_www_set_http_accept(raptor_www* www, const char *value)
522 {
523   char *value_copy;
524   size_t len=8; /* strlen("Accept:")+1 */
525 
526   if(value)
527     len+=1+strlen(value); /* " "+value */
528 
529   value_copy=(char*)RAPTOR_MALLOC(cstring, len);
530   if(!value_copy)
531     return;
532   www->http_accept=value_copy;
533 
534   strcpy(value_copy, "Accept:");
535   value_copy+=7;
536   if(value) {
537     *value_copy++=' ';
538     strcpy(value_copy, value);
539   }
540 
541 #if RAPTOR_DEBUG > 1
542   RAPTOR_DEBUG2("Using Accept header: '%s'\n", www->http_accept);
543 #endif
544 }
545 
546 
547 /**
548  * raptor_www_set_connection_timeout:
549  * @www: WWW object
550  * @timeout: Timeout in seconds
551  *
552  * Set WWW connection timeout
553  **/
554 void
raptor_www_set_connection_timeout(raptor_www * www,int timeout)555 raptor_www_set_connection_timeout(raptor_www* www, int timeout)
556 {
557   www->connection_timeout=timeout;
558 }
559 
560 
561 /**
562  * raptor_www_set_http_cache_control:
563  * @www: WWW object
564  * @cache_control: Cache-Control header value (or NULL to disable)
565  *
566  * Set HTTP Cache-Control:header (default none)
567  *
568  * The @cache_control value can be a string to set it, "" to send
569  * a blank header or NULL to not set the header at all.
570  *
571  * Return value: non-0 on failure
572  **/
573 int
raptor_www_set_http_cache_control(raptor_www * www,const char * cache_control)574 raptor_www_set_http_cache_control(raptor_www* www, const char* cache_control)
575 {
576   char *cache_control_copy;
577   const char* const header="Cache-Control:";
578   const size_t header_len=14; /* strlen("Cache-Control:") */
579   size_t len;
580 
581   RAPTOR_ASSERT((strlen(header) != header_len), "Cache-Control header length is wrong");
582 
583   if(www->cache_control) {
584     RAPTOR_FREE(cstring, www->cache_control);
585     www->cache_control=NULL;
586   }
587 
588   if(!cache_control) {
589     www->cache_control=NULL;
590     return 0;
591   }
592 
593   len=header_len + 1 +strlen(cache_control) + 1; /* header+" "+cache_control+"\0" */
594 
595   cache_control_copy=(char*)RAPTOR_MALLOC(cstring, len);
596   if(!cache_control_copy)
597     return 1;
598 
599   www->cache_control=cache_control_copy;
600 
601   strncpy(cache_control_copy, header, header_len);
602   cache_control_copy+= header_len;
603   if(*cache_control) {
604     *cache_control_copy++=' ';
605     strcpy(cache_control_copy, cache_control);
606   }
607 
608 #if RAPTOR_DEBUG > 1
609   RAPTOR_DEBUG2("Using Cache-Control header: '%s'\n", www->cache_control);
610 #endif
611 
612   return 0;
613 }
614 
615 
616 /**
617  * raptor_www_set_uri_filter:
618  * @www: WWW object
619  * @filter: URI filter function
620  * @user_data: User data to pass to filter function
621  *
622  * Set URI filter function for WWW retrieval.
623  **/
624 void
raptor_www_set_uri_filter(raptor_www * www,raptor_uri_filter_func filter,void * user_data)625 raptor_www_set_uri_filter(raptor_www* www,
626                           raptor_uri_filter_func filter,
627                           void *user_data)
628 {
629   www->uri_filter=filter;
630   www->uri_filter_user_data=user_data;
631 }
632 
633 
634 /**
635  * raptor_www_get_connection:
636  * @www: #raptor_www object
637  *
638  * Get WWW library connection object.
639  *
640  * Return the internal WWW connection handle.  For libcurl, this
641  * returns the CURL handle and for libxml the context.  Otherwise
642  * it returns NULL.
643  *
644  * Return value: connection pointer
645  **/
646 void*
raptor_www_get_connection(raptor_www * www)647 raptor_www_get_connection(raptor_www* www)
648 {
649 #ifdef RAPTOR_WWW_NONE
650   return NULL;
651 #endif
652 
653 #ifdef RAPTOR_WWW_LIBCURL
654   return www->curl_handle;
655 #endif
656 
657 #ifdef RAPTOR_WWW_LIBXML
658   return www->ctxt;
659 #endif
660 
661 #ifdef RAPTOR_WWW_LIBFETCH
662   return NULL;
663 #endif
664 
665   return NULL;
666 }
667 
668 
669 /**
670  * raptor_www_abort:
671  * @www: WWW object
672  * @reason: abort reason message
673  *
674  * Abort an ongoing raptor WWW operation and pass back a reason.
675  *
676  * This is typically used within one of the raptor WWW handlers
677  * when retrieval need no longer continue due to another
678  * processing issue or error.
679  **/
680 void
raptor_www_abort(raptor_www * www,const char * reason)681 raptor_www_abort(raptor_www* www, const char *reason)
682 {
683   www->failed=1;
684 }
685 
686 
687 void
raptor_www_error(raptor_www * www,const char * message,...)688 raptor_www_error(raptor_www* www, const char *message, ...)
689 {
690   va_list arguments;
691 
692   va_start(arguments, message);
693 
694   raptor_log_error_varargs(www->world,
695                            RAPTOR_LOG_LEVEL_ERROR,
696                            www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].handler,
697                            www->error_handlers.handlers[RAPTOR_LOG_LEVEL_ERROR].user_data,
698                            &www->locator,
699                            message, arguments);
700 
701   va_end(arguments);
702 }
703 
704 
705 static int
raptor_www_file_handle_fetch(raptor_www * www,FILE * fh)706 raptor_www_file_handle_fetch(raptor_www* www, FILE* fh)
707 {
708   unsigned char buffer[RAPTOR_WWW_BUFFER_SIZE+1];
709 
710   while(!feof(fh)) {
711     int len=fread(buffer, 1, RAPTOR_WWW_BUFFER_SIZE, fh);
712     if(len > 0) {
713       www->total_bytes += len;
714       buffer[len]='\0';
715 
716       if(www->write_bytes)
717         www->write_bytes(www, www->write_bytes_userdata, buffer, len, 1);
718     }
719 
720     if(feof(fh) || www->failed)
721       break;
722   }
723 
724   if(!www->failed)
725     www->status_code=200;
726 
727   return www->failed;
728 }
729 
730 
731 static int
raptor_www_file_fetch(raptor_www * www)732 raptor_www_file_fetch(raptor_www* www)
733 {
734   char *filename;
735   FILE *fh;
736   unsigned char *uri_string=raptor_uri_as_string_v2(www->world, www->uri);
737 #if defined(HAVE_UNISTD_H) && defined(HAVE_SYS_STAT_H)
738   struct stat buf;
739 #endif
740 
741   www->status_code=200;
742 
743   filename=raptor_uri_uri_string_to_filename(uri_string);
744   if(!filename) {
745     raptor_www_error(www, "Not a file: URI");
746     return 1;
747   }
748 
749 #if defined(HAVE_UNISTD_H) && defined(HAVE_SYS_STAT_H)
750   if(!stat(filename, &buf) && S_ISDIR(buf.st_mode)) {
751     raptor_www_error(www, "Cannot read from a directory '%s'", filename);
752     RAPTOR_FREE(cstring, filename);
753     www->status_code=404;
754     return 1;
755   }
756 #endif
757 
758   fh=fopen(filename, "rb");
759   if(!fh) {
760     raptor_www_error(www, "file '%s' open failed - %s",
761                      filename, strerror(errno));
762     RAPTOR_FREE(cstring, filename);
763     www->status_code=(errno == EACCES) ? 403: 404;
764     www->failed=1;
765 
766     return www->failed;
767   }
768 
769   raptor_www_file_handle_fetch(www, fh);
770   fclose(fh);
771 
772   RAPTOR_FREE(cstring, filename);
773 
774   return www->failed;
775 }
776 
777 
778 /**
779 * raptor_www_fetch:
780 * @www: WWW object
781 * @uri: URI to read from
782 *
783 * Start a WWW content retrieval for the given URI, returning data via the write_bytes handler.
784 *
785 * Return value: non-0 on failure.
786 **/
787 int
raptor_www_fetch(raptor_www * www,raptor_uri * uri)788 raptor_www_fetch(raptor_www *www, raptor_uri *uri)
789 {
790   int status=1;
791 
792   www->uri=raptor_new_uri_for_retrieval_v2(www->world, uri);
793 
794   www->locator.uri=uri;
795   www->locator.line= -1;
796   www->locator.column= -1;
797 
798   if(www->uri_filter)
799     if(www->uri_filter(www->uri_filter_user_data, uri))
800       return status;
801 
802 #ifdef RAPTOR_WWW_NONE
803   status=raptor_www_file_fetch(www);
804 #else
805 
806   if(raptor_uri_uri_string_is_file_uri(raptor_uri_as_string_v2(www->world, www->uri)))
807     status=raptor_www_file_fetch(www);
808   else {
809 #ifdef RAPTOR_WWW_LIBCURL
810     status=raptor_www_curl_fetch(www);
811 #endif
812 
813 #ifdef RAPTOR_WWW_LIBXML
814     status=raptor_www_libxml_fetch(www);
815 #endif
816 
817 #ifdef RAPTOR_WWW_LIBFETCH
818     status=raptor_www_libfetch_fetch(www);
819 #endif
820   }
821 
822 #endif
823   if(!status && www->status_code && www->status_code != 200){
824     raptor_www_error(www, "Resolving URI failed with HTTP status %d",
825                      www->status_code);
826     status=1;
827   }
828 
829   www->failed=status;
830 
831   return www->failed;
832 }
833 
834 
835 static void
raptor_www_fetch_to_string_write_bytes(raptor_www * www,void * userdata,const void * ptr,size_t size,size_t nmemb)836 raptor_www_fetch_to_string_write_bytes(raptor_www* www, void *userdata,
837                                        const void *ptr, size_t size,
838                                        size_t nmemb)
839 {
840   raptor_stringbuffer* sb=(raptor_stringbuffer*)userdata;
841   int len=size*nmemb;
842 
843   raptor_stringbuffer_append_counted_string(sb, (unsigned char*)ptr, len, 1);
844 }
845 
846 
847 /**
848  * raptor_www_fetch_to_string:
849  * @www: raptor_www object
850  * @uri: raptor_uri to retrieve
851  * @string_p: pointer to location to hold string
852  * @length_p: pointer to location to hold length of string (or NULL)
853  * @malloc_handler: pointer to malloc to use to make string (or NULL)
854  *
855  * Start a WWW content retrieval for the given URI, returning the data in a new string.
856  *
857  * If malloc_handler is null, raptor will allocate it using it's
858  * own memory allocator.  *string_p is set to NULL on failure (and
859  * *length_p to 0 if length_p is not NULL).
860  *
861  * Return value: non-0 on failure
862  **/
863 RAPTOR_EXTERN_C
864 int
raptor_www_fetch_to_string(raptor_www * www,raptor_uri * uri,void ** string_p,size_t * length_p,void * (* malloc_handler)(size_t size))865 raptor_www_fetch_to_string(raptor_www *www, raptor_uri *uri,
866                            void **string_p, size_t *length_p,
867                            void *(*malloc_handler)(size_t size))
868 {
869   raptor_stringbuffer *sb=NULL;
870   void *str=NULL;
871   raptor_www_write_bytes_handler saved_write_bytes;
872   void *saved_write_bytes_userdata;
873 
874   sb=raptor_new_stringbuffer();
875   if(!sb)
876     return 1;
877 
878   if(length_p)
879     *length_p=0;
880 
881   saved_write_bytes=www->write_bytes;
882   saved_write_bytes_userdata=www->write_bytes_userdata;
883   raptor_www_set_write_bytes_handler(www, raptor_www_fetch_to_string_write_bytes, sb);
884 
885   if(raptor_www_fetch(www, uri))
886     str=NULL;
887   else {
888     size_t len=raptor_stringbuffer_length(sb);
889     if(len) {
890       str=(void*)malloc_handler(len+1);
891       if(str) {
892         raptor_stringbuffer_copy_to_string(sb, (unsigned char*)str, len+1);
893         *string_p=str;
894         if(length_p)
895           *length_p=len;
896       }
897     }
898   }
899 
900   if(sb)
901     raptor_free_stringbuffer(sb);
902 
903   raptor_www_set_write_bytes_handler(www, saved_write_bytes, saved_write_bytes_userdata);
904 
905   return (str == NULL);
906 }
907 
908 
909 /**
910  * raptor_www_get_final_uri:
911  * @www: #raptor_www object
912  *
913  * Get the WWW final resolved URI.
914  *
915  * This returns the URI used after any protocol redirection.
916  *
917  * Return value: a new URI or NULL if not known.
918  **/
919 raptor_uri*
raptor_www_get_final_uri(raptor_www * www)920 raptor_www_get_final_uri(raptor_www* www)
921 {
922   return www->final_uri ? raptor_uri_copy_v2(www->world, www->final_uri) : NULL;
923 }
924 
925 
926 /**
927  * raptor_www_set_final_uri_handler:
928  * @www: WWW object
929  * @handler: content type handler function
930  * @user_data: content type handler data
931  *
932  * Set the handler to receive the HTTP Content-Type header value.
933  *
934  * This is called if or when the value is discovered during retrieval
935  * by the raptor_www implementation.  Not all implementations provide
936  * access to this.
937  **/
938 void
raptor_www_set_final_uri_handler(raptor_www * www,raptor_www_final_uri_handler handler,void * user_data)939 raptor_www_set_final_uri_handler(raptor_www* www,
940                                  raptor_www_final_uri_handler handler,
941                                  void *user_data)
942 {
943   www->final_uri_handler=handler;
944   www->final_uri_userdata=user_data;
945 }
946