1 /*
2 * Copyright (c) 2013 Tim Ruehsen
3 * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4 *
5 * This file is part of libwget.
6 *
7 * Libwget is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * Libwget is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libwget. If not, see <https://www.gnu.org/licenses/>.
19 *
20 *
21 * Example for HTML parsing using libwget
22 *
23 * Changelog
24 * 03.01.2014 Tim Ruehsen created
25 *
26 * Demonstrate how to extract URIs from HTML files using callback functions.
27 * We don't care about character encoding in this example.
28 *
29 */
30
31 #include <unistd.h>
32 #include <wget.h>
33
html_parse_localfile(const char * fname)34 static void html_parse_localfile(const char *fname)
35 {
36 char *data, *data_allocated;
37 size_t len;
38
39 if ((data_allocated = data = wget_read_file(fname, &len))) {
40 const char *encoding = NULL;
41
42 if ((unsigned char)data[0] == 0xFE && (unsigned char)data[1] == 0xFF) {
43 // Big-endian UTF-16
44 encoding = "UTF-16BE";
45
46 // adjust behind BOM, ignore trailing single byte
47 data += 2;
48 len -= 2;
49 } else if ((unsigned char)data[0] == 0xFF && (unsigned char)data[1] == 0xFE) {
50 // Little-endian UTF-16
51 encoding = "UTF-16LE";
52
53 // adjust behind BOM
54 data += 2;
55 len -= 2;
56 } else if ((unsigned char)data[0] == 0xEF && (unsigned char)data[1] == 0xBB && (unsigned char)data[2] == 0xBF) {
57 // UTF-8
58 encoding = "UTF-8";
59
60 // adjust behind BOM
61 data += 3;
62 len -= 3;
63 }
64
65 if (encoding)
66 printf("URI encoding '%s' set by BOM\n", encoding);
67
68 if (!wget_strncasecmp_ascii(encoding, "UTF-16", 6)) {
69 size_t n;
70 char *utf8;
71
72 len -= len & 1; // ignore single trailing byte, else charset conversion fails
73
74 if (wget_memiconv(encoding, data, len, "UTF-8", &utf8, &n) == 0) {
75 printf("Convert non-ASCII encoding '%s' to UTF-8\n", encoding);
76 wget_xfree(data_allocated);
77 data_allocated = data = utf8;
78 } else {
79 printf("Failed to convert non-ASCII encoding '%s' to UTF-8, skip parsing\n", encoding);
80 return;
81 }
82 }
83
84 wget_html_parsed_result *res = wget_html_get_urls_inline(data, NULL, NULL);
85
86 if (encoding) {
87 if (res->encoding && wget_strcasecmp_ascii(encoding, res->encoding))
88 printf("Encoding '%s' as stated in document has been ignored\n", encoding);
89 }
90
91 for (int it = 0; it < wget_vector_size(res->uris); it++) {
92 wget_html_parsed_url *html_url = wget_vector_get(res->uris, it);
93 wget_string *url = &html_url->url;
94
95 printf(" %s.%s '%.*s'", html_url->tag, html_url->attr, (int) url->len, url->p);
96 if (html_url->download.p)
97 printf(" (save as '%.*s')", (int) html_url->download.len, html_url->download.p);
98 printf("\n");
99 }
100
101 wget_xfree(data_allocated);
102 wget_html_free_urls_inline(&res);
103 }
104 }
105
/*
 * Entry point: parse HTML from STDIN when it is not a terminal, otherwise
 * parse each file named on the command line.
 *
 * Always returns 0.
 */
int main(int argc, const char *const *argv)
{
	/*
	 * Disabled: explicit libwget initialization with debug/error/info streams.
	 *
	wget_global_init(
		WGET_DEBUG_STREAM, stderr,
		WGET_ERROR_STREAM, stderr,
		WGET_INFO_STREAM, stdout,
		NULL);
	*/

	// STDIN is not a terminal (e.g. piped or redirected): read the HTML
	// data from there; command line arguments are not looked at
	if (!isatty(STDIN_FILENO)) {
		html_parse_localfile("-");
		return 0;
	}

	// STDIN is a terminal: parse the HTML files given as arguments
	// ('-' as filename selects STDIN)
	for (int i = 1; i < argc; i++) {
		const char *fname = argv[i];

		printf("%s:\n", fname);
		html_parse_localfile(fname);
	}

	return 0;
}
133