1 /**
2  * @file feed_parser.c  parsing of different feed formats
3  *
4  * Copyright (C) 2008-2017 Lars Windolf <lars.windolf@gmx.de>
5  *
6  * This program is free software; you can redistribute it and/or modify
7  * it under the terms of the GNU General Public License as published by
8  * the Free Software Foundation; either version 2 of the License, or
9  * (at your option) any later version.
10  *
11  * This program is distributed in the hope that it will be useful,
12  * but WITHOUT ANY WARRANTY; without even the implied warranty of
13  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  * GNU General Public License for more details.
15  *
16  * You should have received a copy of the GNU General Public License
17  * along with this program; if not, write to the Free Software
18  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
19  */
20 
21 #include <string.h>
22 
23 #include "common.h"
24 #include "debug.h"
25 #include "html.h"
26 #include "metadata.h"
27 #include "xml.h"
28 #include "parsers/cdf_channel.h"
29 #include "parsers/rss_channel.h"
30 #include "parsers/atom10.h"
31 #include "parsers/pie_feed.h"
32 
33 static GSList *feedHandlers = NULL;	/**< list of available parser implementations */
34 
35 struct feed_type {
36 	gint id_num;
37 	gchar *id_str;
38 };
39 
40 static GSList *
feed_parsers_get_list(void)41 feed_parsers_get_list (void)
42 {
43 	if (feedHandlers)
44 		return feedHandlers;
45 
46 	feedHandlers = g_slist_append (feedHandlers, rss_init_feed_handler ());
47 	feedHandlers = g_slist_append (feedHandlers, cdf_init_feed_handler ());
48 	feedHandlers = g_slist_append (feedHandlers, atom10_init_feed_handler ());  /* Must be before pie */
49 	feedHandlers = g_slist_append (feedHandlers, pie_init_feed_handler ());
50 
51 	return feedHandlers;
52 }
53 
54 const gchar *
feed_type_fhp_to_str(feedHandlerPtr fhp)55 feed_type_fhp_to_str (feedHandlerPtr fhp)
56 {
57 	if (!fhp)
58 		return NULL;
59 	return fhp->typeStr;
60 }
61 
62 feedHandlerPtr
feed_type_str_to_fhp(const gchar * str)63 feed_type_str_to_fhp (const gchar *str)
64 {
65 	GSList *iter;
66 	feedHandlerPtr fhp = NULL;
67 
68 	if (!str)
69 		return NULL;
70 
71 	if (strstr(str, "pie"))
72 		return feed_type_str_to_fhp ("atom");
73 
74 	for(iter = feed_parsers_get_list (); iter != NULL; iter = iter->next) {
75 		fhp = (feedHandlerPtr)iter->data;
76 		if(!strcmp(str, fhp->typeStr))
77 			return fhp;
78 	}
79 
80 	return NULL;
81 }
82 
83 feedParserCtxtPtr
feed_create_parser_ctxt(void)84 feed_create_parser_ctxt (void)
85 {
86 	feedParserCtxtPtr ctxt;
87 
88 	ctxt = g_new0 (struct feedParserCtxt, 1);
89 	ctxt->tmpdata = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
90 	return ctxt;
91 }
92 
93 void
feed_free_parser_ctxt(feedParserCtxtPtr ctxt)94 feed_free_parser_ctxt (feedParserCtxtPtr ctxt)
95 {
96 	if (ctxt) {
97 		/* Don't free the itemset! */
98 		g_hash_table_destroy (ctxt->tmpdata);
99 		g_free (ctxt->title);
100 		g_free (ctxt);
101 	}
102 }
103 
104 /**
105  * This function tries to find a feed link for a given HTTP URI. It
106  * tries to download it. If it finds a valid feed source it parses
107  * this source instead into the given feed parsing context. It also
108  * replaces the HTTP URI with the found feed source.
109  */
110 static void
feed_parser_auto_discover(feedParserCtxtPtr ctxt)111 feed_parser_auto_discover (feedParserCtxtPtr ctxt)
112 {
113 	gchar	*source;
114 
115 	if (ctxt->feed->parseErrors)
116 		g_string_truncate (ctxt->feed->parseErrors, 0);
117 	else
118 		ctxt->feed->parseErrors = g_string_new(NULL);
119 
120 	debug1 (DEBUG_UPDATE, "Starting feed auto discovery (%s)", subscription_get_source (ctxt->subscription));
121 
122 	source = html_auto_discover_feed (ctxt->data, subscription_get_source (ctxt->subscription));
123 
124 	/* FIXME: we only need the !g_str_equal as a workaround after a 404 */
125 	if (source && !g_str_equal (source, subscription_get_source (ctxt->subscription))) {
126 		debug1 (DEBUG_UPDATE, "Discovered link: %s", source);
127 		ctxt->failed = FALSE;
128 		subscription_set_source (ctxt->subscription, source);
129 
130 		/* The feed that was processed wasn't the correct one, we need to redownload it.
131 		 * Cancel the update in case there's one in progress */
132 		subscription_cancel_update (ctxt->subscription);
133 		subscription_update (ctxt->subscription, FEED_REQ_RESET_TITLE);
134 		g_free (source);
135 	} else {
136 		debug0 (DEBUG_UPDATE, "No feed link found!");
137 		g_string_append (ctxt->feed->parseErrors, _("The URL you want Liferea to subscribe to points to a webpage and the auto discovery found no feeds on this page. Maybe this webpage just does not support feed auto discovery."));
138 	}
139 }
140 
141 /**
142  * General feed source parsing function. Parses the passed feed source
143  * and tries to determine the source type.
144  *
145  * @param ctxt		feed parsing context
146  *
147  * @returns FALSE if auto discovery is indicated,
148  *          TRUE if feed type was recognized and parsing was successful
149  */
150 gboolean
feed_parse(feedParserCtxtPtr ctxt)151 feed_parse (feedParserCtxtPtr ctxt)
152 {
153 	xmlNodePtr	cur;
154 	gboolean	success = FALSE;
155 
156 	debug_enter("feed_parse");
157 
158 	g_assert(NULL == ctxt->items);
159 
160 	ctxt->failed = TRUE;	/* reset on success ... */
161 
162 	if(ctxt->feed->parseErrors)
163 		g_string_truncate(ctxt->feed->parseErrors, 0);
164 	else
165 		ctxt->feed->parseErrors = g_string_new(NULL);
166 
167 	/* try to parse buffer with XML and to create a DOM tree */
168 	do {
169 		if(NULL == xml_parse_feed (ctxt)) {
170 			g_string_append_printf (ctxt->feed->parseErrors, _("XML error while reading feed! Feed \"%s\" could not be loaded!"), subscription_get_source (ctxt->subscription));
171 			break;
172 		}
173 
174 		if(NULL == (cur = xmlDocGetRootElement(ctxt->doc))) {
175 			g_string_append(ctxt->feed->parseErrors, _("Empty document!"));
176 			break;
177 		}
178 
179 		while(cur && xmlIsBlankNode(cur)) {
180 			cur = cur->next;
181 		}
182 
183 		if(!cur)
184 			break;
185 
186 		if(!cur->name) {
187 			g_string_append(ctxt->feed->parseErrors, _("Invalid XML!"));
188 			break;
189 		}
190 
191 		/* determine the syndication format and start parser */
192 		GSList *handlerIter = feed_parsers_get_list ();
193 		while(handlerIter) {
194 			feedHandlerPtr handler = (feedHandlerPtr)(handlerIter->data);
195 			if(handler && handler->checkFormat && (*(handler->checkFormat))(ctxt->doc, cur)) {
196 				/* free old temp. parsing data, don't free right after parsing because
197 				   it can be used until the last feed request is finished, move me
198 				   to the place where the last request in list otherRequests is
199 				   finished :-) */
200 				g_hash_table_destroy(ctxt->tmpdata);
201 				ctxt->tmpdata = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, g_free);
202 
203 				/* we always drop old metadata */
204 				metadata_list_free(ctxt->subscription->metadata);
205 				ctxt->subscription->metadata = NULL;
206 				ctxt->failed = FALSE;
207 
208 				ctxt->feed->fhp = handler;
209 				(*(handler->feedParser))(ctxt, cur);
210 
211 				break;
212 			}
213 			handlerIter = handlerIter->next;
214 		}
215 	} while(0);
216 
217 	/* if the given URI isn't valid we need to start auto discovery */
218 	if(ctxt->failed)
219 		feed_parser_auto_discover (ctxt);
220 
221 	if(ctxt->failed) {
222 		/* Autodiscovery failed */
223 		/* test if we have a HTML page */
224 		if((strstr(ctxt->data, "<html>") || strstr(ctxt->data, "<HTML>") ||
225 		    strstr(ctxt->data, "<html ") || strstr(ctxt->data, "<HTML "))) {
226 			debug0(DEBUG_UPDATE, "HTML document detected!");
227 			g_string_append(ctxt->feed->parseErrors, _("Source points to HTML document."));
228 		} else {
229 			debug0(DEBUG_UPDATE, "neither a known feed type nor a HTML document!");
230 			g_string_append(ctxt->feed->parseErrors, _("Could not determine the feed type."));
231 		}
232 	} else {
233 		debug1(DEBUG_UPDATE, "discovered feed format: %s", feed_type_fhp_to_str(ctxt->feed->fhp));
234 		success = TRUE;
235 	}
236 
237 	if(ctxt->doc) {
238 		xmlFreeDoc(ctxt->doc);
239 		ctxt->doc = NULL;
240 	}
241 
242 	debug_exit("feed_parse");
243 
244 	return success;
245 }
246