1 /**
2  * @file atom10.c  Atom 1.0 Parser
3  *
4  * Copyright (C) 2005-2006 Nathan Conrad <t98502@users.sourceforge.net>
5  * Copyright (C) 2003-2014 Lars Windolf <lars.windolf@gmx.de>
6  *
7  * This program is free software; you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation; either version 2 of the License, or
10  * (at your option) any later version.
11  *
12  * This program is distributed in the hope that it will be useful,
13  * but WITHOUT ANY WARRANTY; without even the implied warranty of
14  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
15  * GNU General Public License for more details.
16  *
17  * You should have received a copy of the GNU General Public License
18  * along with this program; if not, write to the Free Software
19  * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA  02111-1307  USA
20  */
21 
22 #include "atom10.h"
23 
24 #include <string.h>
25 
26 #include "common.h"
27 #include "date.h"
28 #include "debug.h"
29 #include "enclosure.h"
30 #include "feed_parser.h"
31 #include "feedlist.h"
32 #include "ns_admin.h"
33 #include "ns_ag.h"
34 #include "ns_blogChannel.h"
35 #include "ns_cC.h"
36 #include "ns_content.h"
37 #include "ns_dc.h"
38 #include "ns_georss.h"
39 #include "ns_itunes.h"
40 #include "ns_photo.h"
41 #include "ns_media.h"
42 #include "ns_slash.h"
43 #include "ns_syn.h"
44 #include "ns_trackback.h"
45 #include "ns_wfw.h"
46 #include "metadata.h"
47 #include "subscription.h"
48 #include "xml.h"
49 
/* XML namespace URI that identifies Atom 1.0 elements */
#define ATOM10_NS BAD_CAST"http://www.w3.org/2005/Atom"

/* Tables of NsHandler structs for the extension namespaces that may
   appear inside Atom feeds. atom10_nstable is looked up by namespace
   prefix, ns_atom10_ns_uri_table by namespace URI. Both tables are
   created in atom10_init_feed_handler (). */
GHashTable	*atom10_nstable = NULL;
GHashTable	*ns_atom10_ns_uri_table = NULL;

/* Parser state passed to the element parser callbacks
   (the dispatchers in this file currently pass NULL). */
struct atom10ParserState {
	gboolean errorDetected;
};

/* Signature shared by all Atom element parser callbacks that are
   dispatched by element name from the feed and entry parsers. */
typedef void 	(*atom10ElementParserFunc)	(xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state);
59 
60 static gchar *
atom10_mark_up_text_content(gchar * content)61 atom10_mark_up_text_content (gchar* content)
62 {
63 	gchar **tokens;
64 	gchar **token;
65 	gchar *str, *old_str;
66 
67 	if (!content)
68 		return NULL;
69 	if (!*content)
70 		return g_strdup (content);
71 
72 	tokens = g_strsplit (content, "\n\n", 0);
73 
74 	if (!tokens[0]) { /* No tokens */
75 		str = g_strdup("");
76 	} else if (!tokens[1]) { /* One token */
77 		str = g_markup_escape_text (tokens[0], -1);
78 	} else { /* Many tokens */
79 		token = tokens;
80 		while (*token) {
81 			old_str = *token;
82 			str = g_strchug (g_strchomp (*token)); /* WARNING: modifies the token string*/
83 			if (str[0] != '\0') {
84 				*token = g_markup_printf_escaped ("<p>%s</p>", str);
85 				g_free (old_str);
86 			} else {
87 				**token = '\0'; /* Erase the particular token because it is blank */
88 			}
89 			token++;
90 		}
91 		str = g_strjoinv ("\n", tokens);
92 	}
93 	g_strfreev (tokens);
94 
95 	return str;
96 }
97 
98 /**
99  * This parses an Atom content construct.
100  *
101  * @param cur	the XML node to be parsed
102  * @param ctxt 	a valid feed parser context
103  * @returns g_strduped string which must be freed by the caller.
104  */
105 static gchar *
atom10_parse_content_construct(xmlNodePtr cur,feedParserCtxtPtr ctxt)106 atom10_parse_content_construct (xmlNodePtr cur, feedParserCtxtPtr ctxt)
107 {
108 	gchar *ret = NULL;
109 
110 	if (xmlHasNsProp (cur, BAD_CAST"src", NULL )) {
111 		/*
112 		   RFC 4287 says a feed must have a summary when there's
113 		   a src attribute in the content (and the content therefore
114 		   empty). We are already parsing the summary separately.
115 
116 		   RFC 4287 also says an entry must contain one link element
117 		   with rel="alternate", so there's no point in parsing
118 		   src and setting it as link.
119 		*/
120 		ret = NULL;
121 	} else {
122 		gchar *type;
123 
124 		/* determine encoding mode */
125 		type = xml_get_ns_attribute (cur, "type", NULL);
126 
127 		/* Contents need to be de-encoded and should not contain sub-tags.*/
128 		if (type && (g_str_equal (type,"html") || !g_ascii_strcasecmp (type, "text/html"))) {
129 			ret = xhtml_extract (cur, 0, NULL);
130 		} else if (!type || !strcmp (type, "text") || !strncasecmp (type, "text/",5)) {
131 			gchar *tmp;
132 			/* Assume that "text/ *" files can be directly displayed.. kinda stated in the RFC */
133 			ret = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
134 
135 			g_strchug (g_strchomp (ret));
136 
137 			if (!type || !strcasecmp (type, "text"))
138 				tmp = atom10_mark_up_text_content (ret);
139 			else
140 				tmp = g_markup_printf_escaped ("<pre>%s</pre>", ret);
141 			g_free (ret);
142 			ret = tmp;
143 		} else if (!strcmp(type,"xhtml") || !g_ascii_strcasecmp (type, "application/xhtml+xml")) {
144 			/* The spec says to only show the contents of the div tag that MUST be present */
145 			ret = xhtml_extract (cur, 2, NULL);
146 		} else {
147 			/* Do nothing on unsupported content types. This allows summaries to be used. */
148 			ret = NULL;
149 		}
150 
151 		g_free (type);
152 	}
153 
154 	return ret;
155 }
156 
157 /**
158  * Parse Atom 1.0 text tags of all sorts.
159  *
160  * @param htmlified	If set to 1, then HTML is returned.
161  * 			When set to 0, All HTML tags are removed
162  *
163  * @returns an escaped version of a text construct.
164  */
165 static gchar *
atom10_parse_text_construct(xmlNodePtr cur,gboolean htmlified)166 atom10_parse_text_construct (xmlNodePtr cur, gboolean htmlified)
167 {
168 	gchar	*type, *tmp, *ret = NULL;
169 
170 	/* determine encoding mode */
171 	type = xml_get_ns_attribute (cur, "type", NULL);
172 
173 	/* not sure what MIME types are necessary... */
174 
175 	/* This that need to be de-encoded and should not contain sub-tags.*/
176 	if (!type || !strcmp(type, "text")) {
177 		ret = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
178 		if (ret) {
179 			g_strchug (g_strchomp (ret));
180 
181 			if (htmlified) {
182 				tmp = atom10_mark_up_text_content (ret);
183 				g_free (ret);
184 				ret = tmp;
185 			}
186 		}
187 	} else if (!strcmp(type, "html")) {
188 		ret = xhtml_extract (cur, 0, NULL);
189 		if (!htmlified)
190 			ret = unhtmlize (unxmlize (ret));
191 	} else if (!strcmp (type, "xhtml")) {
192 		/* The spec says to show the contents of the div tag that MUST be present */
193 		ret = xhtml_extract (cur, 2, NULL);
194 
195 		if (!htmlified)
196 			ret = unhtmlize (ret);
197 	} else {
198 		/* Invalid Atom feed */
199 		ret = g_strdup ("This attribute was invalidly specified in this Atom feed.");
200 	}
201 
202 	g_free (type);
203 
204 	return ret;
205 }
206 
207 static gchar *
atom10_parse_person_construct(xmlNodePtr cur)208 atom10_parse_person_construct (xmlNodePtr cur)
209 {
210 	gchar	*tmp = NULL;
211 	gchar	*name = NULL, *uri = NULL, *email = NULL;
212 	gboolean invalid = FALSE;
213 
214 	cur = cur->xmlChildrenNode;
215 	while (cur) {
216 		if (NULL == cur->name || cur->type != XML_ELEMENT_NODE || cur->ns == NULL || cur->ns->href == NULL) {
217 			cur = cur->next;
218 			continue;
219 		}
220 
221 		if (xmlStrEqual (cur->ns->href, ATOM10_NS)) {
222 			if (xmlStrEqual (cur->name, BAD_CAST"name")) {
223 				g_free (name);
224 				name = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
225 			}
226 
227 			if (xmlStrEqual (cur->name, BAD_CAST"email")) {
228 				if (email)
229 					invalid = TRUE;
230 				g_free(email);
231 				tmp = (gchar *)xmlNodeListGetString(cur->doc, cur->xmlChildrenNode, 1);
232 				email = g_markup_printf_escaped (" - <a href=\"mailto:%s\">%s</a>", tmp, tmp);
233 				g_free(tmp);
234 			}
235 
236 			if (xmlStrEqual(cur->name, BAD_CAST"uri")) {
237 				if (uri)
238 					invalid = TRUE;
239 				g_free (uri);
240 				tmp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
241 				uri = g_markup_printf_escaped (" (<a href=\"%s\">%s</a>)", tmp, _("Website"));
242 				g_free (tmp);
243 			}
244 		} else {
245 			/* FIXME: handle extension elements here */
246 		}
247 		cur = cur->next;
248 	}
249 
250 	if (!name)
251 		invalid = TRUE;
252 
253 	if (!invalid)
254 		tmp = g_strdup_printf ("%s%s%s", name, uri?uri:"", email?email:"");
255 	else
256 		tmp = NULL;
257 
258 	g_free (uri);
259 	g_free (email);
260 	g_free (name);
261 	return tmp;
262 }
263 
264 /* Note: this function is called for both item and feed context */
265 static gchar *
atom10_parse_link(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)266 atom10_parse_link (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
267 {
268 	gchar *href, *alternate = NULL;
269 
270 	href = xml_get_ns_attribute (cur, "href", NULL);
271 	if (href) {
272 		xmlChar *baseURL = xmlNodeGetBase (cur->doc, cur);
273 		gchar *url, *relation, *type, *escTitle = NULL, *title;
274 		const gchar *feedURL = subscription_get_homepage (ctxt->subscription);
275 
276 		if (!baseURL && feedURL && feedURL[0] != '|' && strstr (feedURL, "://"))
277 			baseURL = xmlStrdup (BAD_CAST (feedURL));
278 		url = (gchar *)common_build_url (href, (gchar *)baseURL);
279 
280 		type = xml_get_ns_attribute (cur, "type", NULL);
281 		relation = xml_get_ns_attribute (cur, "rel", NULL);
282 		title = xml_get_ns_attribute (cur, "title", NULL);
283 		if (title)
284 			escTitle = g_markup_escape_text (title, -1);
285 
286 		if (!xmlHasNsProp (cur, BAD_CAST"rel", NULL) || !relation || g_str_equal (relation, BAD_CAST"alternate")) {
287 			alternate = g_strdup (url);
288 		} else if (g_str_equal (relation, "self")) {
289 			alternate = g_strdup (url);
290 		} else if (g_str_equal (relation, "replies")) {
291 			if (!type || g_str_equal (type, BAD_CAST"application/atom+xml")) {
292 				gchar *commentUri = (gchar *)common_build_url ((gchar *)url, subscription_get_homepage (ctxt->subscription));
293 				if (ctxt->item)
294 					metadata_list_set (&ctxt->item->metadata, "commentFeedUri", commentUri);
295 				g_free (commentUri);
296 			}
297 		} else if (g_str_equal (relation, "enclosure")) {
298 			if (ctxt->item) {
299 				gsize length = 0;
300 				gchar *lengthStr = xml_get_ns_attribute (cur, "length", NULL);
301 				if (lengthStr)
302 					length = atol (lengthStr);
303 				g_free (lengthStr);
304 
305 				gchar *encStr = enclosure_values_to_string (url, type, length, FALSE /* not yet downloaded */);
306 				ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "enclosure", encStr);
307 				ctxt->item->hasEnclosure = TRUE;
308 				g_free (encStr);
309 			}
310 		} else if (g_str_equal (relation, "related") || g_str_equal (relation, "via")) {
311 			if (ctxt->item)
312 				ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, relation, url);
313 		} else {
314 			/* g_warning ("Unhandled Atom link with unexpected relation \"%s\"\n", relation); */
315 		}
316 		xmlFree (title);
317 		xmlFree (baseURL);
318 		g_free (escTitle);
319 		g_free (url);
320 		g_free(relation);
321 		g_free(type);
322 		g_free(href);
323 	} else {
324 		/* FIXME: @href is required, this document is not valid Atom */;
325 	}
326 
327 	return alternate;
328 }
329 
330 static void
atom10_parse_entry_author(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)331 atom10_parse_entry_author (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
332 {
333 	gchar *author;
334 
335 	author = atom10_parse_person_construct (cur);
336 	if (author) {
337 		ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "author", author);
338 		g_free (author);
339 	}
340 }
341 
342 static void
atom10_parse_entry_category(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)343 atom10_parse_entry_category (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
344 {
345 	gchar *category = NULL;
346 
347 	category = xml_get_ns_attribute (cur, "label", NULL);
348 	if (!category)
349 		category = xml_get_ns_attribute (cur, "term", NULL);
350 
351 	if (category) {
352 		gchar *escaped = g_markup_escape_text (category, -1);
353 
354 		/* Black-list some categories used by Google Reader clone online
355 		   readers that should not be visible to the end-user */
356 		if (!g_str_equal (category, "reading-list") &&
357 		    !g_str_equal (category, "read") &&
358 		    !strstr(category, "user/-/label/"))
359 			ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "category", escaped);
360 
361 		g_free (escaped);
362 		xmlFree (category);
363 	}
364 }
365 
366 static void
atom10_parse_entry_content(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)367 atom10_parse_entry_content (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
368 {
369 	gchar *content;
370 
371 	content = atom10_parse_content_construct (cur, ctxt);
372 	if (content) {
373 		item_set_description (ctxt->item, content);
374 		g_free (content);
375 	}
376 }
377 
378 static void
atom10_parse_entry_contributor(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)379 atom10_parse_entry_contributor (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
380 {
381 	gchar *contributor;
382 
383 	contributor = atom10_parse_person_construct (cur);
384 	if (contributor) {
385 		ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "contributor", contributor);
386 		g_free (contributor);
387 	}
388 }
389 
390 static void
atom10_parse_entry_id(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)391 atom10_parse_entry_id (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
392 {
393 	gchar *id;
394 
395 	id = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
396 	if (id) {
397 		if (strlen (id) > 0) {
398 			item_set_id (ctxt->item, id);
399 			ctxt->item->validGuid = TRUE;
400 		}
401 		g_free (id);
402 	}
403 }
404 
405 static void
atom10_parse_entry_link(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)406 atom10_parse_entry_link (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
407 {
408 	gchar *href;
409 
410 	href = atom10_parse_link (cur, ctxt, state);
411 	if (href) {
412 		item_set_source (ctxt->item, href);
413 		g_free (href);
414 	}
415 }
416 
417 static void
atom10_parse_entry_published(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)418 atom10_parse_entry_published (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
419 {
420 	gchar *datestr;
421 
422 	datestr = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
423 	if (datestr) {
424 		ctxt->item->time = date_parse_ISO8601 (datestr);
425 		ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "pubDate", datestr);
426 		g_free (datestr);
427 	}
428 }
429 
430 static void
atom10_parse_entry_rights(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)431 atom10_parse_entry_rights (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
432 {
433 	gchar *rights;
434 
435 	rights = atom10_parse_text_construct (cur, FALSE);
436 	if (rights) {
437 		ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "copyright", rights);
438 		g_free (rights);
439 	}
440 }
441 
442 /* <summary> can be used for short text descriptions, if there is no
443    <content> description we show the <summary> content */
444 static void
atom10_parse_entry_summary(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)445 atom10_parse_entry_summary (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
446 {
447 	gchar *summary;
448 
449 	summary = atom10_parse_text_construct (cur, TRUE);
450 	if (summary) {
451 		item_set_description (ctxt->item, summary);
452 		g_free (summary);
453 	}
454 	/* FIXME: set a flag to show a "Read more" link to the user; but where? */
455 }
456 
457 static void
atom10_parse_entry_title(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)458 atom10_parse_entry_title (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
459 {
460 	gchar *title;
461 
462 	title = atom10_parse_text_construct(cur, FALSE);
463 	if (title) {
464 		item_set_title (ctxt->item, title);
465 		g_free (title);
466 	}
467 }
468 
469 static void
atom10_parse_entry_updated(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)470 atom10_parse_entry_updated (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
471 {
472 	gchar *datestr;
473 
474 	datestr = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
475 	/* if pubDate is already set, don't overwrite it */
476 	if (datestr && !metadata_list_get(ctxt->item->metadata, "pubDate")) {
477 		ctxt->item->time = date_parse_ISO8601 (datestr);
478 		ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "contentUpdateDate", datestr);
479 	}
480 
481 	g_free (datestr);
482 }
483 
484 /* <content> tag support, FIXME: base64 not supported */
485 /* method to parse standard tags for each item element */
486 static itemPtr
atom10_parse_entry(feedParserCtxtPtr ctxt,xmlNodePtr cur)487 atom10_parse_entry (feedParserCtxtPtr ctxt, xmlNodePtr cur)
488 {
489 	NsHandler		*nsh;
490 	parseItemTagFunc	pf;
491 	atom10ElementParserFunc func;
492 	static GHashTable	*entryElementHash = NULL;
493 
494 	if (!entryElementHash) {
495 		entryElementHash = g_hash_table_new (g_str_hash, g_str_equal);
496 
497 		g_hash_table_insert (entryElementHash, "author", &atom10_parse_entry_author);
498 		g_hash_table_insert (entryElementHash, "category", &atom10_parse_entry_category);
499 		g_hash_table_insert (entryElementHash, "content", &atom10_parse_entry_content);
500 		g_hash_table_insert (entryElementHash, "contributor", &atom10_parse_entry_contributor);
501 		g_hash_table_insert (entryElementHash, "id", &atom10_parse_entry_id);
502 		g_hash_table_insert (entryElementHash, "link", &atom10_parse_entry_link);
503 		g_hash_table_insert (entryElementHash, "published", &atom10_parse_entry_published);
504 		g_hash_table_insert (entryElementHash, "rights", &atom10_parse_entry_rights);
505 		/* FIXME: Parse "source" */
506 		g_hash_table_insert (entryElementHash, "summary", &atom10_parse_entry_summary);
507 		g_hash_table_insert (entryElementHash, "title", &atom10_parse_entry_title);
508 		g_hash_table_insert (entryElementHash, "updated", &atom10_parse_entry_updated);
509 	}
510 
511 	ctxt->item = item_new ();
512 
513 	cur = cur->xmlChildrenNode;
514 	while (cur) {
515 
516 		if (cur->type != XML_ELEMENT_NODE || cur->name == NULL || cur->ns == NULL) {
517 			cur = cur->next;
518 			continue;
519 		}
520 
521 		if ((cur->ns->href   && (nsh = (NsHandler *)g_hash_table_lookup (ns_atom10_ns_uri_table, (gpointer)cur->ns->href))) ||
522 		    (cur->ns->prefix && (nsh = (NsHandler *)g_hash_table_lookup (atom10_nstable, (gpointer)cur->ns->prefix)))) {
523 
524 			pf = nsh->parseItemTag;
525 			if (pf)
526 				(*pf) (ctxt, cur);
527 			cur = cur->next;
528 			continue;
529 		}
530 
531 		/* check namespace of this tag */
532 		if (!cur->ns->href) {
533 			/* This is an invalid feed... no idea what to do with the current element */
534 			debug1 (DEBUG_PARSING, "element with no namespace found in atom feed (%s)!", cur->name);
535 			cur = cur->next;
536 			continue;
537 		}
538 
539 
540 		if (xmlStrcmp(cur->ns->href, ATOM10_NS)) {
541 			debug1(DEBUG_PARSING, "unknown namespace %s found!", cur->ns->href);
542 			cur = cur->next;
543 			continue;
544 		}
545 		/* At this point, the namespace must be the Atom 1.0 namespace */
546 		func = g_hash_table_lookup (entryElementHash, cur->name);
547 		if (func) {
548 			(*func) (cur, ctxt, NULL);
549 		} else {
550 			debug1 (DEBUG_PARSING, "unknown entry element \"%s\" found", cur->name);
551 		}
552 
553 		cur = cur->next;
554 	}
555 
556 	/* after parsing we fill the infos into the itemPtr structure */
557 	ctxt->item->readStatus = FALSE;
558 
559 	if (0 == ctxt->item->time)
560 		ctxt->item->time = ctxt->feed->time;
561 
562 	return ctxt->item;
563 }
564 
565 static void
atom10_parse_feed_author(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)566 atom10_parse_feed_author (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
567 {
568 	/* parse feed author */
569 	gchar *author = atom10_parse_person_construct (cur);
570 	if (author) {
571 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "author", author);
572 		g_free (author);
573 	}
574 	/* FIXME: make item parsing use this author if not specified elsewhere */
575 }
576 
577 static void
atom10_parse_feed_category(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)578 atom10_parse_feed_category (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
579 {
580 	gchar *label = NULL;
581 
582 	label = xml_get_ns_attribute (cur, "label", NULL);
583 	if (!label)
584 		label = xml_get_ns_attribute (cur, "term", NULL);
585 
586 	if (label) {
587 		gchar *escaped = g_markup_escape_text (label, -1);
588 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "category", escaped);
589 		g_free (escaped);
590 		xmlFree (label);
591 	}
592 }
593 
594 static void
atom10_parse_feed_contributor(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)595 atom10_parse_feed_contributor (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
596 {
597 	/* parse feed contributors */
598 	gchar *contributer = atom10_parse_person_construct (cur);
599 	if (contributer) {
600 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "contributor", contributer);
601 		g_free (contributer);
602 	}
603 }
604 
605 static void
atom10_parse_feed_generator(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)606 atom10_parse_feed_generator (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
607 {
608 	gchar *ret, *version, *tmp = NULL, *uri;
609 
610 	ret = unhtmlize ((gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1));
611 	if (ret && ret[0] != '\0') {
612 		version = xml_get_ns_attribute (cur, "version", NULL);
613 		if (version) {
614 			tmp = g_strdup_printf ("%s %s", ret, version);
615 			g_free (ret);
616 			g_free (version);
617 			ret = tmp;
618 		}
619 		uri = xml_get_ns_attribute (cur, "uri", NULL);
620 		if (uri) {
621 			tmp = g_markup_printf_escaped ("<a href=\"%s\">%s</a>", uri, ret);
622 			g_free (uri);
623 			g_free (ret);
624 			ret = tmp;
625 		}
626 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "feedgenerator", tmp);
627 	}
628 	g_free (ret);
629 }
630 
631 static void
atom10_parse_feed_icon(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)632 atom10_parse_feed_icon (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
633 {
634 	gchar *icon_uri;
635 
636 	icon_uri = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
637 
638 	if (icon_uri) {
639 		debug1 (DEBUG_PARSING, "icon URI found in atom feed: %s", icon_uri);
640 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata,
641 								     "icon", icon_uri);
642 	}
643 }
644 
645 static void
atom10_parse_feed_id(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)646 atom10_parse_feed_id (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
647 {
648 	/* FIXME: Parse ID, but I'm not sure where Liferea would use it */
649 }
650 
651 static void
atom10_parse_feed_link(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)652 atom10_parse_feed_link (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
653 {
654 	gchar *href;
655 
656 	href = atom10_parse_link (cur, ctxt, state);
657 	if (href) {
658 		xmlChar *baseURL = xmlNodeGetBase (cur->doc, xmlDocGetRootElement (cur->doc));
659 
660 		subscription_set_homepage (ctxt->subscription, href);
661 		/* Set the default base to the feed's HTML URL if not set yet */
662 		if (baseURL == NULL)
663 			xmlNodeSetBase (xmlDocGetRootElement (cur->doc), (xmlChar *)href);
664 		else xmlFree (baseURL);
665 		g_free (href);
666 	}
667 }
668 
669 static void
atom10_parse_feed_logo(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)670 atom10_parse_feed_logo (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
671 {
672 	gchar *logoUrl;
673 
674 	logoUrl = atom10_parse_text_construct (cur, FALSE);
675 	if (logoUrl) {
676 		metadata_list_set (&ctxt->subscription->metadata, "imageUrl", logoUrl);
677 		g_free (logoUrl);
678 	}
679 }
680 
681 static void
atom10_parse_feed_rights(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)682 atom10_parse_feed_rights (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
683 {
684 	gchar *rights;
685 
686 	rights = atom10_parse_text_construct (cur, FALSE);
687 	if (rights) {
688 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "copyright", rights);
689 		g_free (rights);
690 	}
691 }
692 
693 static void
atom10_parse_feed_subtitle(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)694 atom10_parse_feed_subtitle (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
695 {
696 	gchar *subtitle;
697 
698 	subtitle = atom10_parse_text_construct (cur, TRUE);
699 	if (subtitle) {
700  		metadata_list_set (&ctxt->subscription->metadata, "description", subtitle);
701 		g_free (subtitle);
702 	}
703 }
704 
705 static void
atom10_parse_feed_title(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)706 atom10_parse_feed_title (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
707 {
708 	gchar *title;
709 
710 	title = atom10_parse_text_construct(cur, FALSE);
711 	if (title) {
712 		if (ctxt->title)
713 			g_free (ctxt->title);
714 		ctxt->title = title;
715 	}
716 }
717 
718 /* Sort items in descending date order (newer items first). */
719 static gint
atom10_item_sort_by_date(gconstpointer a,gconstpointer b)720 atom10_item_sort_by_date (gconstpointer a, gconstpointer b)
721 {
722 	itemPtr item1 = (itemPtr)a;
723 	itemPtr item2 = (itemPtr)b;
724 
725 	g_assert (item1 && item2);
726 
727 	if (item1->time == item2->time) {
728 		/* Items identical.. can we distinguish further? */
729 		return 0;
730 	}
731 
732 	if (item1->time < item2->time)
733 		return 1;
734 	if (item1->time > item2->time)
735 		return -1;
736 
737 	return 0;
738 }
739 
740 static void
atom10_parse_feed_updated(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)741 atom10_parse_feed_updated (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
742 {
743 	gchar *timestamp;
744 
745 	timestamp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
746 	if (timestamp) {
747 		ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "contentUpdateDate", timestamp);
748 		ctxt->feed->time = date_parse_ISO8601 (timestamp);
749 		g_free (timestamp);
750 	}
751 }
752 
753 /* reads a Atom feed URL and returns a new channel structure (even if
754    the feed could not be read) */
755 static void
atom10_parse_feed(feedParserCtxtPtr ctxt,xmlNodePtr cur)756 atom10_parse_feed (feedParserCtxtPtr ctxt, xmlNodePtr cur)
757 {
758 	NsHandler		*nsh;
759 	parseChannelTagFunc	pf;
760 	atom10ElementParserFunc func;
761 	static GHashTable	*feedElementHash = NULL;
762 
763 	if(!feedElementHash) {
764 		feedElementHash = g_hash_table_new (g_str_hash, g_str_equal);
765 
766 		g_hash_table_insert (feedElementHash, "author", &atom10_parse_feed_author);
767 		g_hash_table_insert (feedElementHash, "category", &atom10_parse_feed_category);
768 		g_hash_table_insert (feedElementHash, "contributor", &atom10_parse_feed_contributor);
769 		g_hash_table_insert (feedElementHash, "generator", &atom10_parse_feed_generator);
770 		g_hash_table_insert (feedElementHash, "icon", &atom10_parse_feed_icon);
771 		g_hash_table_insert (feedElementHash, "id", &atom10_parse_feed_id);
772 		g_hash_table_insert (feedElementHash, "link", &atom10_parse_feed_link);
773 		g_hash_table_insert (feedElementHash, "logo", &atom10_parse_feed_logo);
774 		g_hash_table_insert (feedElementHash, "rights", &atom10_parse_feed_rights);
775 		g_hash_table_insert (feedElementHash, "subtitle", &atom10_parse_feed_subtitle);
776 		g_hash_table_insert (feedElementHash, "title", &atom10_parse_feed_title);
777 		g_hash_table_insert (feedElementHash, "updated", &atom10_parse_feed_updated);
778 	}
779 
780 	while (TRUE) {
781 		if (xmlStrcmp (cur->name, BAD_CAST"feed")) {
782 			g_string_append (ctxt->feed->parseErrors, "<p>Could not find Atom 1.0 header!</p>");
783 			break;
784 		}
785 
786 		/* parse feed contents */
787 		cur = cur->xmlChildrenNode;
788 		while (cur) {
789 		 	if (!cur->name || cur->type != XML_ELEMENT_NODE || !cur->ns) {
790 				cur = cur->next;
791 				continue;
792 			}
793 
794 			/* check if supported namespace should handle the current tag
795 			   by trying to determine a namespace handler */
796 
797 			nsh = NULL;
798 
799 			if (cur->ns->href)
800 				nsh = (NsHandler *)g_hash_table_lookup (ns_atom10_ns_uri_table, (gpointer)cur->ns->href);
801 
802 			if (cur->ns->prefix && !nsh)
803 				nsh = (NsHandler *)g_hash_table_lookup (atom10_nstable, (gpointer)cur->ns->prefix);
804 
805 			if(nsh) {
806 				pf = nsh->parseChannelTag;
807 				if(pf)
808 					(*pf)(ctxt, cur);
809 				cur = cur->next;
810 				continue;
811 			}
812 
813 			/* check namespace of this tag */
814 			if (!cur->ns->href) {
815 				/* This is an invalid feed... no idea what to do with the current element */
816 				debug1 (DEBUG_PARSING, "element with no namespace found in atom feed (%s)!", cur->name);
817 				cur = cur->next;
818 				continue;
819 			}
820 
821 			if (xmlStrcmp (cur->ns->href, ATOM10_NS)) {
822 				debug1 (DEBUG_PARSING, "unknown namespace %s found in atom feed!", cur->ns->href);
823 				cur = cur->next;
824 				continue;
825 			}
826 			/* At this point, the namespace must be the Atom 1.0 namespace */
827 
828 			func = g_hash_table_lookup (feedElementHash, cur->name);
829 			if (func) {
830 				(*func) (cur, ctxt, NULL);
831 			} else if (xmlStrEqual (cur->name, BAD_CAST"entry")) {
832 				ctxt->item = atom10_parse_entry (ctxt, cur);
833 				if (ctxt->item)
834 					ctxt->items = g_list_insert_sorted (ctxt->items, ctxt->item, atom10_item_sort_by_date);
835 			}
836 			cur = cur->next;
837 		}
838 
839 		/* FIXME: Maybe check to see that the required information was actually provided (persuant to the RFC). */
840 		/* after parsing we fill in the infos into the feedPtr structure */
841 
842 		break;
843 	}
844 }
845 
846 static gboolean
atom10_format_check(xmlDocPtr doc,xmlNodePtr cur)847 atom10_format_check (xmlDocPtr doc, xmlNodePtr cur)
848 {
849 	if (cur->name == NULL || cur->ns == NULL || cur->ns->href == NULL)
850 		return FALSE;
851 	return xmlStrEqual (cur->name, BAD_CAST"feed") && xmlStrEqual (cur->ns->href, ATOM10_NS);
852 }
853 
854 static void
atom10_add_ns_handler(NsHandler * handler)855 atom10_add_ns_handler (NsHandler *handler)
856 {
857 	g_assert (NULL != atom10_nstable);
858 	g_hash_table_insert (atom10_nstable, (gpointer)handler->prefix, handler);
859 	g_assert (handler->registerNs != NULL);
860 	handler->registerNs (handler, atom10_nstable, ns_atom10_ns_uri_table);
861 }
862 
863 feedHandlerPtr
atom10_init_feed_handler(void)864 atom10_init_feed_handler (void)
865 {
866 	feedHandlerPtr	fhp;
867 
868 	fhp = g_new0 (struct feedHandler, 1);
869 
870 	if (!atom10_nstable) {
871 		atom10_nstable = g_hash_table_new (g_str_hash, g_str_equal);
872 		ns_atom10_ns_uri_table = g_hash_table_new (g_str_hash, g_str_equal);
873 
874 		/* register name space handlers */
875 		atom10_add_ns_handler (ns_bC_get_handler ());
876 		atom10_add_ns_handler (ns_dc_get_handler ());
877   		atom10_add_ns_handler (ns_slash_get_handler ());
878 		atom10_add_ns_handler (ns_content_get_handler ());
879 		atom10_add_ns_handler (ns_syn_get_handler ());
880 		atom10_add_ns_handler (ns_admin_get_handler ());
881 		atom10_add_ns_handler (ns_ag_get_handler ());
882 		atom10_add_ns_handler (ns_cC_get_handler ());
883 		atom10_add_ns_handler (ns_photo_get_handler ());
884 		atom10_add_ns_handler (ns_pb_get_handler ());
885 		atom10_add_ns_handler (ns_wfw_get_handler ());
886 		atom10_add_ns_handler (ns_media_get_handler ());
887 		atom10_add_ns_handler (ns_trackback_get_handler ());
888 		atom10_add_ns_handler (ns_georss_get_handler ());
889 	}
890 	/* prepare feed handler structure */
891 	fhp->typeStr = "atom";
892 	fhp->feedParser	= atom10_parse_feed;
893 	fhp->checkFormat = atom10_format_check;
894 
895 	return fhp;
896 }
897