1 /**
2 * @file feed_parser.c parsing of different feed formats
3 *
4 * Copyright (C) 2008-2017 Lars Windolf <lars.windolf@gmx.de>
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, write to the Free Software
18 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
19 */
20
21 #include <string.h>
22
23 #include "common.h"
24 #include "debug.h"
25 #include "html.h"
26 #include "metadata.h"
27 #include "xml.h"
28 #include "parsers/cdf_channel.h"
29 #include "parsers/rss_channel.h"
30 #include "parsers/atom10.h"
31 #include "parsers/pie_feed.h"
32
33 static GSList *feedHandlers = NULL; /**< list of available parser implementations */
34
35 struct feed_type {
36 gint id_num;
37 gchar *id_str;
38 };
39
40 static GSList *
feed_parsers_get_list(void)41 feed_parsers_get_list (void)
42 {
43 if (feedHandlers)
44 return feedHandlers;
45
46 feedHandlers = g_slist_append (feedHandlers, rss_init_feed_handler ());
47 feedHandlers = g_slist_append (feedHandlers, cdf_init_feed_handler ());
48 feedHandlers = g_slist_append (feedHandlers, atom10_init_feed_handler ()); /* Must be before pie */
49 feedHandlers = g_slist_append (feedHandlers, pie_init_feed_handler ());
50
51 return feedHandlers;
52 }
53
54 const gchar *
feed_type_fhp_to_str(feedHandlerPtr fhp)55 feed_type_fhp_to_str (feedHandlerPtr fhp)
56 {
57 if (!fhp)
58 return NULL;
59 return fhp->typeStr;
60 }
61
62 feedHandlerPtr
feed_type_str_to_fhp(const gchar * str)63 feed_type_str_to_fhp (const gchar *str)
64 {
65 GSList *iter;
66 feedHandlerPtr fhp = NULL;
67
68 if (!str)
69 return NULL;
70
71 if (strstr(str, "pie"))
72 return feed_type_str_to_fhp ("atom");
73
74 for(iter = feed_parsers_get_list (); iter != NULL; iter = iter->next) {
75 fhp = (feedHandlerPtr)iter->data;
76 if(!strcmp(str, fhp->typeStr))
77 return fhp;
78 }
79
80 return NULL;
81 }
82
83 feedParserCtxtPtr
feed_create_parser_ctxt(void)84 feed_create_parser_ctxt (void)
85 {
86 feedParserCtxtPtr ctxt;
87
88 ctxt = g_new0 (struct feedParserCtxt, 1);
89 ctxt->tmpdata = g_hash_table_new_full (g_str_hash, g_str_equal, NULL, g_free);
90 return ctxt;
91 }
92
93 void
feed_free_parser_ctxt(feedParserCtxtPtr ctxt)94 feed_free_parser_ctxt (feedParserCtxtPtr ctxt)
95 {
96 if (ctxt) {
97 /* Don't free the itemset! */
98 g_hash_table_destroy (ctxt->tmpdata);
99 g_free (ctxt->title);
100 g_free (ctxt);
101 }
102 }
103
104 /**
105 * This function tries to find a feed link for a given HTTP URI. It
106 * tries to download it. If it finds a valid feed source it parses
107 * this source instead into the given feed parsing context. It also
108 * replaces the HTTP URI with the found feed source.
109 */
110 static void
feed_parser_auto_discover(feedParserCtxtPtr ctxt)111 feed_parser_auto_discover (feedParserCtxtPtr ctxt)
112 {
113 gchar *source;
114
115 if (ctxt->feed->parseErrors)
116 g_string_truncate (ctxt->feed->parseErrors, 0);
117 else
118 ctxt->feed->parseErrors = g_string_new(NULL);
119
120 debug1 (DEBUG_UPDATE, "Starting feed auto discovery (%s)", subscription_get_source (ctxt->subscription));
121
122 source = html_auto_discover_feed (ctxt->data, subscription_get_source (ctxt->subscription));
123
124 /* FIXME: we only need the !g_str_equal as a workaround after a 404 */
125 if (source && !g_str_equal (source, subscription_get_source (ctxt->subscription))) {
126 debug1 (DEBUG_UPDATE, "Discovered link: %s", source);
127 ctxt->failed = FALSE;
128 subscription_set_source (ctxt->subscription, source);
129
130 /* The feed that was processed wasn't the correct one, we need to redownload it.
131 * Cancel the update in case there's one in progress */
132 subscription_cancel_update (ctxt->subscription);
133 subscription_update (ctxt->subscription, FEED_REQ_RESET_TITLE);
134 g_free (source);
135 } else {
136 debug0 (DEBUG_UPDATE, "No feed link found!");
137 g_string_append (ctxt->feed->parseErrors, _("The URL you want Liferea to subscribe to points to a webpage and the auto discovery found no feeds on this page. Maybe this webpage just does not support feed auto discovery."));
138 }
139 }
140
141 /**
142 * General feed source parsing function. Parses the passed feed source
143 * and tries to determine the source type.
144 *
145 * @param ctxt feed parsing context
146 *
147 * @returns FALSE if auto discovery is indicated,
148 * TRUE if feed type was recognized and parsing was successful
149 */
150 gboolean
feed_parse(feedParserCtxtPtr ctxt)151 feed_parse (feedParserCtxtPtr ctxt)
152 {
153 xmlNodePtr cur;
154 gboolean success = FALSE;
155
156 debug_enter("feed_parse");
157
158 g_assert(NULL == ctxt->items);
159
160 ctxt->failed = TRUE; /* reset on success ... */
161
162 if(ctxt->feed->parseErrors)
163 g_string_truncate(ctxt->feed->parseErrors, 0);
164 else
165 ctxt->feed->parseErrors = g_string_new(NULL);
166
167 /* try to parse buffer with XML and to create a DOM tree */
168 do {
169 if(NULL == xml_parse_feed (ctxt)) {
170 g_string_append_printf (ctxt->feed->parseErrors, _("XML error while reading feed! Feed \"%s\" could not be loaded!"), subscription_get_source (ctxt->subscription));
171 break;
172 }
173
174 if(NULL == (cur = xmlDocGetRootElement(ctxt->doc))) {
175 g_string_append(ctxt->feed->parseErrors, _("Empty document!"));
176 break;
177 }
178
179 while(cur && xmlIsBlankNode(cur)) {
180 cur = cur->next;
181 }
182
183 if(!cur)
184 break;
185
186 if(!cur->name) {
187 g_string_append(ctxt->feed->parseErrors, _("Invalid XML!"));
188 break;
189 }
190
191 /* determine the syndication format and start parser */
192 GSList *handlerIter = feed_parsers_get_list ();
193 while(handlerIter) {
194 feedHandlerPtr handler = (feedHandlerPtr)(handlerIter->data);
195 if(handler && handler->checkFormat && (*(handler->checkFormat))(ctxt->doc, cur)) {
196 /* free old temp. parsing data, don't free right after parsing because
197 it can be used until the last feed request is finished, move me
198 to the place where the last request in list otherRequests is
199 finished :-) */
200 g_hash_table_destroy(ctxt->tmpdata);
201 ctxt->tmpdata = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, g_free);
202
203 /* we always drop old metadata */
204 metadata_list_free(ctxt->subscription->metadata);
205 ctxt->subscription->metadata = NULL;
206 ctxt->failed = FALSE;
207
208 ctxt->feed->fhp = handler;
209 (*(handler->feedParser))(ctxt, cur);
210
211 break;
212 }
213 handlerIter = handlerIter->next;
214 }
215 } while(0);
216
217 /* if the given URI isn't valid we need to start auto discovery */
218 if(ctxt->failed)
219 feed_parser_auto_discover (ctxt);
220
221 if(ctxt->failed) {
222 /* Autodiscovery failed */
223 /* test if we have a HTML page */
224 if((strstr(ctxt->data, "<html>") || strstr(ctxt->data, "<HTML>") ||
225 strstr(ctxt->data, "<html ") || strstr(ctxt->data, "<HTML "))) {
226 debug0(DEBUG_UPDATE, "HTML document detected!");
227 g_string_append(ctxt->feed->parseErrors, _("Source points to HTML document."));
228 } else {
229 debug0(DEBUG_UPDATE, "neither a known feed type nor a HTML document!");
230 g_string_append(ctxt->feed->parseErrors, _("Could not determine the feed type."));
231 }
232 } else {
233 debug1(DEBUG_UPDATE, "discovered feed format: %s", feed_type_fhp_to_str(ctxt->feed->fhp));
234 success = TRUE;
235 }
236
237 if(ctxt->doc) {
238 xmlFreeDoc(ctxt->doc);
239 ctxt->doc = NULL;
240 }
241
242 debug_exit("feed_parse");
243
244 return success;
245 }
246