1 /*
2 * Copyright (c) 2013 Tim Ruehsen
3 * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4 *
5 * This file is part of libwget.
6 *
7 * Libwget is free software: you can redistribute it and/or modify
8 * it under the terms of the GNU Lesser General Public License as published by
9 * the Free Software Foundation, either version 3 of the License, or
10 * (at your option) any later version.
11 *
12 * Libwget is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU Lesser General Public License for more details.
16 *
17 * You should have received a copy of the GNU Lesser General Public License
18 * along with libwget. If not, see <https://www.gnu.org/licenses/>.
19 *
20 *
21 * Extracting URLs from RSS feeds (https://cyber.harvard.edu/rss/rss.html)
22 *
23 * Changelog
24 * 21.12.2013 Tim Ruehsen created
25 *
26 */
27
28 #include <config.h>
29
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <c-ctype.h>
34
35 #include <wget.h>
36 #include "private.h"
37
38 struct rss_context {
39 wget_vector
40 *urls;
41 };
42
rss_get_url(void * context,int flags,const char * dir,const char * attr,const char * val,size_t len,size_t pos WGET_GCC_UNUSED)43 static void rss_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED)
44 {
45 struct rss_context *ctx = context;
46 wget_string * url;
47
48 if (!val || !len)
49 return;
50
51 if ((flags & XML_FLG_ATTRIBUTE)) {
52 if (!wget_strcasecmp_ascii(attr, "url") || !wget_strcasecmp_ascii(attr, "href")
53 || !wget_strcasecmp_ascii(attr, "src") || !wget_strcasecmp_ascii(attr, "domain")
54 || !wget_strcasecmp_ascii(attr, "xmlns") || !wget_strncasecmp_ascii(attr, "xmlns:", 6))
55 {
56 for (;len && c_isspace(*val); val++, len--); // skip leading spaces
57 for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces
58
59 if (!(url = wget_malloc(sizeof(wget_string))))
60 return;
61
62 url->p = val;
63 url->len = len;
64
65 if (!ctx->urls)
66 ctx->urls = wget_vector_create(32, NULL);
67
68 wget_vector_add(ctx->urls, url);
69 }
70 }
71 else if ((flags & XML_FLG_CONTENT)) {
72 const char *elem = strrchr(dir, '/');
73
74 if (elem) {
75 elem++;
76
77 if (!wget_strcasecmp_ascii(elem, "guid") || !wget_strcasecmp_ascii(elem, "link")
78 || !wget_strcasecmp_ascii(elem, "comments") || !wget_strcasecmp_ascii(elem, "docs"))
79 {
80 for (;len && c_isspace(*val); val++, len--); // skip leading spaces
81 for (;len && c_isspace(val[len - 1]); len--); // skip trailing spaces
82
83 // debug_printf("#2 %02X %s %s '%.*s' %zd\n", flags, dir, attr, (int) len, val, len);
84
85 if (!(url = wget_malloc(sizeof(wget_string))))
86 return;
87
88 url->p = val;
89 url->len = len;
90
91 if (!ctx->urls)
92 ctx->urls = wget_vector_create(32, NULL);
93
94 wget_vector_add(ctx->urls, url);
95 }
96 }
97 }
98 }
99
/*
 * Parse the RSS document in `rss` and hand back the URL candidates found.
 *
 * rss   buffer passed to wget_xml_parse_buffer()
 * urls  out parameter; receives the vector of wget_string entries gathered
 *       by rss_get_url(), or NULL when nothing was collected
 *
 * NOTE(review): the entries store the value pointers given to the parser
 * callback rather than copies - presumably these point into `rss`, so the
 * buffer must outlive the vector; confirm against the XML parser.
 */
void wget_rss_get_urls_inline(const char *rss, wget_vector **urls)
{
	struct rss_context collected;

	collected.urls = NULL;

	wget_xml_parse_buffer(rss, rss_get_url, &collected, XML_HINT_REMOVE_EMPTY_CONTENT);

	*urls = collected.urls;
}
108