1 /*
2  * Copyright (c) 2013 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * Example for HTML parsing using libwget
22  *
23  * Changelog
24  * 03.01.2014  Tim Ruehsen  created
25  *
26  * Demonstrate how to extract URIs from HTML files using callback functions.
27  * We don't care about character encoding in this example.
28  *
29  */
30 
31 #include <unistd.h>
32 #include <wget.h>
33 
html_parse_localfile(const char * fname)34 static void html_parse_localfile(const char *fname)
35 {
36 	char *data, *data_allocated;
37 	size_t len;
38 
39 	if ((data_allocated = data = wget_read_file(fname, &len))) {
40 		const char *encoding = NULL;
41 
42 		if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
43 			// Big-endian UTF-16
44 			encoding = "UTF-16BE";
45 
46 			// adjust behind BOM, ignore trailing single byte
47 			data += 2;
48 			len -= 2;
49 		} else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
50 			// Little-endian UTF-16
51 			encoding = "UTF-16LE";
52 
53 			// adjust behind BOM
54 			data += 2;
55 			len -= 2;
56 		} else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
57 			// UTF-8
58 			encoding = "UTF-8";
59 
60 			// adjust behind BOM
61 			data += 3;
62 			len -= 3;
63 		}
64 
65 		if (encoding)
66 			printf("URI encoding '%s' set by BOM\n", encoding);
67 
68 		if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
69 			size_t n;
70 			char *utf8;
71 
72 			len -= len & 1; // ignore single trailing byte, else charset conversion fails
73 
74 			if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) {
75 				printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);
76 				wget_xfree(data_allocated);
77 				data_allocated = data = utf8;
78 			} else {
79 				printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
80 				return;
81 			}
82 		}
83 
84 		wget_html_parsed_result *res  = wget_html_get_urls_inline(data, NULL, NULL);
85 
86 		if (encoding) {
87 			if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
88 				printf("Encoding '%s' as stated in document has been ignored\n", encoding);
89 		}
90 
91 		for (int it = 0; it < wget_vector_size(res->uris); it++) {
92 			wget_html_parsed_url *html_url = wget_vector_get(res->uris, it);
93 			wget_string *url = &html_url->url;
94 
95 			printf("  %s.%s '%.*s'", html_url->tag, html_url->attr, (int) url->len, url->p);
96 			if (html_url->download.p)
97 				printf(" (save as '%.*s')", (int) html_url->download.len, html_url->download.p);
98 			printf("\n");
99 		}
100 
101 		wget_xfree(data_allocated);
102 		wget_html_free_urls_inline(&res);
103 	}
104 }
105 
/**
 * Program entry point.
 *
 * When STDIN is not a terminal, the HTML data is read from STDIN.
 * Otherwise every command line argument is treated as an HTML file name
 * and parsed in turn, with the file name printed before its results.
 *
 * @param argc  Number of command line arguments.
 * @param argv  Command line arguments; argv[1..argc-1] are file names.
 * @return      Always 0.
 */
int main(int argc, const char *const *argv)
{
	/*
	 * Optional stream setup, disabled in this example:
	 *
	 *	wget_global_init(
	 *		WGET_DEBUG_STREAM, stderr,
	 *		WGET_ERROR_STREAM, stderr,
	 *		WGET_INFO_STREAM, stdout,
	 *		NULL);
	 */

	if (isatty(STDIN_FILENO)) {
		// interactive terminal: parse the HTML files named on the command line
		for (int i = 1; i < argc; i++) {
			const char *name = argv[i];

			printf("%s:\n", name);

			// a file name of '-' stands for STDIN
			html_parse_localfile(name);
		}

		return 0;
	}

	// STDIN is redirected or piped: take the HTML data from there
	html_parse_localfile("-");

	return 0;
}
133