1 /*
2  * Copyright (c) 2013 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * Advanced example for CSS parsing using libwget
22  *
23  * Changelog
24  * 15.01.2013  Tim Ruehsen  created
25  *
26  * Demonstrate how to extract URIs from CSS files into a vector.
27  *
28  */
29 
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <wget.h>
34 
35 // use the helper routines provided by libwget
36 #define info_printf        wget_info_printf
37 // #define error_printf       wget_error_printf
38 #define error_printf_exit  wget_error_printf_exit
39 
usage(const char * myname)40 static void WGET_GCC_NORETURN usage(const char *myname)
41 {
42 	error_printf_exit(
43 		"\nUsage: %s [options] file...\n"\
44 		"  --base <URI>          Default base for relative URIs, default: http://www.example.com\n"\
45 		"  --encoding <Encoding> Default file character encoding, default: iso-8859-1\n"\
46 		"\n"\
47 		"  Examples:\n"\
48 		"    %s --base http://www.mydomain.com x.css\n"\
49 		"    cat x.css | %s --base http://www.mydomain.com -\n"\
50 		"    %s http://www.example.com\n"\
51 		"\n"\
52 		"  Print URIs as found (without a base):\n"\
53 		"    %s --base \"\" x.css\n\n",
54 		myname, myname, myname, myname, myname);
55 }
56 
main(int argc,const char ** argv)57 int main(int argc, const char **argv)
58 {
59 	// Base URI for converting relative to absolute URIs
60 	const char *
61 		base = NULL;
62 
63 	// We assume that base is encoded in the local charset.
64 	const char *
65 		local_encoding = wget_local_charset_encoding();
66 
67 	// parsed 'base'
68 	wget_iri
69 		*base_uri;
70 
71 	// Character encoding of CSS file content
72 	// An HTTP response may contain the encoding in the Content-Type header,
73 	// see https://stackoverflow.com/questions/2526033/why-specify-charset-utf-8-in-your-css-file
74 	const char *
75 		css_encoding = NULL;
76 
77 	int
78 		argpos;
79 
80 	// We want the libwget error messages be printed to STDERR.
81 	// From here on, we can call wget_error_printf, etc.
82 	wget_logger_set_stream(wget_get_logger(WGET_LOGGER_ERROR), stderr);
83 
84 	// We want the libwget info messages be printed to STDOUT.
85 	// From here on, we can call wget_info_printf, etc.
86 	wget_logger_set_stream(wget_get_logger(WGET_LOGGER_INFO), stdout);
87 
88 	// parse options
89 	for (argpos = 1; argpos < argc; argpos++) {
90 		if (!strcmp(argv[argpos], "--base") && argc - argpos > 1) {
91 			base = argv[++argpos];
92 			info_printf("Local URI encoding = '%s'\n", local_encoding);
93 		} else if (!strcmp(argv[argpos], "--encoding") && argc - argpos > 1) {
94 			css_encoding = argv[++argpos];
95 		} else if (!strcmp(argv[argpos], "--")) {
96 			argpos++;
97 			break;
98 		} else if (argv[argpos][0] == '-') {
99 			usage(argv[0]);
100 		} else
101 			break;
102 	}
103 
104 	// All URIs are converted into UTF-8 charset.
105 	// That's why we need the local encoding (aka 'encoding of base URI') here.
106 	base_uri = base ? wget_iri_parse(base, local_encoding) : NULL;
107 
108 	for (;argpos < argc; argpos++) {
109 		// use '-' as filename for STDIN
110 		wget_vector *css_urls = wget_css_get_urls_from_localfile(argv[argpos], base_uri, &css_encoding);
111 
112 		if (wget_vector_size(css_urls) > 0) {
113 			info_printf("URL encoding for %s is '%s':\n", argv[argpos], css_encoding ? css_encoding : "UTF-8");
114 
115 			for (int it = 0; it < wget_vector_size(css_urls); it++) {
116 				wget_css_parsed_url *css_url = wget_vector_get(css_urls, it);
117 				if (css_url->abs_url)
118 					info_printf("  %s -> %s\n", css_url->url, css_url->abs_url);
119 				else
120 					info_printf("  %s\n", css_url->url);
121 			}
122 
123 			info_printf("\n");
124 		}
125 
126 		wget_vector_free(&css_urls);
127 	}
128 
129 	wget_iri_free(&base_uri);
130 
131 	return 0;
132 }
133