1 /*
2  * Copyright (c) 2013 Tim Ruehsen
3  * Copyright (c) 2015-2021 Free Software Foundation, Inc.
4  *
5  * This file is part of libwget.
6  *
7  * Libwget is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU Lesser General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version.
11  *
12  * Libwget is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU Lesser General Public License for more details.
16  *
17  * You should have received a copy of the GNU Lesser General Public License
18  * along with libwget.  If not, see <https://www.gnu.org/licenses/>.
19  *
20  *
21  * Extracting URLs from RSS feeds (https://cyber.harvard.edu/rss/rss.html)
22  *
23  * Changelog
24  * 21.12.2013  Tim Ruehsen  created
25  *
26  */
27 
28 #include <config.h>
29 
30 #include <unistd.h>
31 #include <stdlib.h>
32 #include <string.h>
33 #include <c-ctype.h>
34 
35 #include <wget.h>
36 #include "private.h"
37 
38 struct rss_context {
39 	wget_vector
40 		*urls;
41 };
42 
rss_get_url(void * context,int flags,const char * dir,const char * attr,const char * val,size_t len,size_t pos WGET_GCC_UNUSED)43 static void rss_get_url(void *context, int flags, const char *dir, const char *attr, const char *val, size_t len, size_t pos WGET_GCC_UNUSED)
44 {
45 	struct rss_context *ctx = context;
46 	wget_string * url;
47 
48 	if (!val || !len)
49 		return;
50 
51 	if ((flags & XML_FLG_ATTRIBUTE)) {
52 		if (!wget_strcasecmp_ascii(attr, "url") || !wget_strcasecmp_ascii(attr, "href")
53 			|| !wget_strcasecmp_ascii(attr, "src") || !wget_strcasecmp_ascii(attr, "domain")
54 			|| !wget_strcasecmp_ascii(attr, "xmlns") || !wget_strncasecmp_ascii(attr, "xmlns:", 6))
55 		{
56 			for (;len && c_isspace(*val); val++, len--); // skip leading spaces
57 			for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces
58 
59 			if (!(url = wget_malloc(sizeof(wget_string))))
60 				return;
61 
62 			url->p = val;
63 			url->len = len;
64 
65 			if (!ctx->urls)
66 				ctx->urls = wget_vector_create(32, NULL);
67 
68 			wget_vector_add(ctx->urls, url);
69 		}
70 	}
71 	else if ((flags & XML_FLG_CONTENT)) {
72 		const char *elem = strrchr(dir, '/');
73 
74 		if (elem) {
75 			elem++;
76 
77 			if (!wget_strcasecmp_ascii(elem, "guid") || !wget_strcasecmp_ascii(elem, "link")
78 				 || !wget_strcasecmp_ascii(elem, "comments") || !wget_strcasecmp_ascii(elem, "docs"))
79 			{
80 				for (;len && c_isspace(*val); val++, len--); // skip leading spaces
81 				for (;len && c_isspace(val[len - 1]); len--);  // skip trailing spaces
82 
83 				// debug_printf("#2 %02X %s %s '%.*s' %zd\n", flags, dir, attr, (int) len, val, len);
84 
85 				if (!(url = wget_malloc(sizeof(wget_string))))
86 					return;
87 
88 				url->p = val;
89 				url->len = len;
90 
91 				if (!ctx->urls)
92 					ctx->urls = wget_vector_create(32, NULL);
93 
94 				wget_vector_add(ctx->urls, url);
95 			}
96 		}
97 	}
98 }
99 
/*
 * Extract URL candidates from an RSS document held in memory.
 *
 * rss:  the RSS document, handed unchanged to wget_xml_parse_buffer()
 * urls: output; receives the vector filled by the parser callback, or NULL
 *       if the callback never created one
 *
 * The vector elements are wget_string items that store the pointers handed
 * over by the XML parser - presumably pointing into 'rss', so that buffer
 * likely has to outlive the vector (TODO confirm against the XML parser).
 */
void wget_rss_get_urls_inline(const char *rss, wget_vector **urls)
{
	struct rss_context ctx;

	ctx.urls = NULL;

	wget_xml_parse_buffer(rss, rss_get_url, &ctx, XML_HINT_REMOVE_EMPTY_CONTENT);

	*urls = ctx.urls;
}
108