1 /**
2 * @file atom10.c Atom 1.0 Parser
3 *
4 * Copyright (C) 2005-2006 Nathan Conrad <t98502@users.sourceforge.net>
5 * Copyright (C) 2003-2014 Lars Windolf <lars.windolf@gmx.de>
6 *
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
11 *
12 * This program is distributed in the hope that it will be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
16 *
17 * You should have received a copy of the GNU General Public License
18 * along with this program; if not, write to the Free Software
19 * Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
20 */
21
22 #include "atom10.h"
23
24 #include <string.h>
25
26 #include "common.h"
27 #include "date.h"
28 #include "debug.h"
29 #include "enclosure.h"
30 #include "feed_parser.h"
31 #include "feedlist.h"
32 #include "ns_admin.h"
33 #include "ns_ag.h"
34 #include "ns_blogChannel.h"
35 #include "ns_cC.h"
36 #include "ns_content.h"
37 #include "ns_dc.h"
38 #include "ns_georss.h"
39 #include "ns_itunes.h"
40 #include "ns_photo.h"
41 #include "ns_media.h"
42 #include "ns_slash.h"
43 #include "ns_syn.h"
44 #include "ns_trackback.h"
45 #include "ns_wfw.h"
46 #include "metadata.h"
47 #include "subscription.h"
48 #include "xml.h"
49
50 #define ATOM10_NS BAD_CAST"http://www.w3.org/2005/Atom"
51
52 /* to store the ATOMNsHandler structs for all supported RDF namespace handlers */
53 GHashTable *atom10_nstable = NULL;
54 GHashTable *ns_atom10_ns_uri_table = NULL;
55 struct atom10ParserState {
56 gboolean errorDetected;
57 };
58 typedef void (*atom10ElementParserFunc) (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state);
59
60 static gchar *
atom10_mark_up_text_content(gchar * content)61 atom10_mark_up_text_content (gchar* content)
62 {
63 gchar **tokens;
64 gchar **token;
65 gchar *str, *old_str;
66
67 if (!content)
68 return NULL;
69 if (!*content)
70 return g_strdup (content);
71
72 tokens = g_strsplit (content, "\n\n", 0);
73
74 if (!tokens[0]) { /* No tokens */
75 str = g_strdup("");
76 } else if (!tokens[1]) { /* One token */
77 str = g_markup_escape_text (tokens[0], -1);
78 } else { /* Many tokens */
79 token = tokens;
80 while (*token) {
81 old_str = *token;
82 str = g_strchug (g_strchomp (*token)); /* WARNING: modifies the token string*/
83 if (str[0] != '\0') {
84 *token = g_markup_printf_escaped ("<p>%s</p>", str);
85 g_free (old_str);
86 } else {
87 **token = '\0'; /* Erase the particular token because it is blank */
88 }
89 token++;
90 }
91 str = g_strjoinv ("\n", tokens);
92 }
93 g_strfreev (tokens);
94
95 return str;
96 }
97
98 /**
99 * This parses an Atom content construct.
100 *
101 * @param cur the XML node to be parsed
102 * @param ctxt a valid feed parser context
103 * @returns g_strduped string which must be freed by the caller.
104 */
105 static gchar *
atom10_parse_content_construct(xmlNodePtr cur,feedParserCtxtPtr ctxt)106 atom10_parse_content_construct (xmlNodePtr cur, feedParserCtxtPtr ctxt)
107 {
108 gchar *ret = NULL;
109
110 if (xmlHasNsProp (cur, BAD_CAST"src", NULL )) {
111 /*
112 RFC 4287 says a feed must have a summary when there's
113 a src attribute in the content (and the content therefore
114 empty). We are already parsing the summary separately.
115
116 RFC 4287 also says an entry must contain one link element
117 with rel="alternate", so there's no point in parsing
118 src and setting it as link.
119 */
120 ret = NULL;
121 } else {
122 gchar *type;
123
124 /* determine encoding mode */
125 type = xml_get_ns_attribute (cur, "type", NULL);
126
127 /* Contents need to be de-encoded and should not contain sub-tags.*/
128 if (type && (g_str_equal (type,"html") || !g_ascii_strcasecmp (type, "text/html"))) {
129 ret = xhtml_extract (cur, 0, NULL);
130 } else if (!type || !strcmp (type, "text") || !strncasecmp (type, "text/",5)) {
131 gchar *tmp;
132 /* Assume that "text/ *" files can be directly displayed.. kinda stated in the RFC */
133 ret = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
134
135 g_strchug (g_strchomp (ret));
136
137 if (!type || !strcasecmp (type, "text"))
138 tmp = atom10_mark_up_text_content (ret);
139 else
140 tmp = g_markup_printf_escaped ("<pre>%s</pre>", ret);
141 g_free (ret);
142 ret = tmp;
143 } else if (!strcmp(type,"xhtml") || !g_ascii_strcasecmp (type, "application/xhtml+xml")) {
144 /* The spec says to only show the contents of the div tag that MUST be present */
145 ret = xhtml_extract (cur, 2, NULL);
146 } else {
147 /* Do nothing on unsupported content types. This allows summaries to be used. */
148 ret = NULL;
149 }
150
151 g_free (type);
152 }
153
154 return ret;
155 }
156
157 /**
158 * Parse Atom 1.0 text tags of all sorts.
159 *
160 * @param htmlified If set to 1, then HTML is returned.
161 * When set to 0, All HTML tags are removed
162 *
163 * @returns an escaped version of a text construct.
164 */
165 static gchar *
atom10_parse_text_construct(xmlNodePtr cur,gboolean htmlified)166 atom10_parse_text_construct (xmlNodePtr cur, gboolean htmlified)
167 {
168 gchar *type, *tmp, *ret = NULL;
169
170 /* determine encoding mode */
171 type = xml_get_ns_attribute (cur, "type", NULL);
172
173 /* not sure what MIME types are necessary... */
174
175 /* This that need to be de-encoded and should not contain sub-tags.*/
176 if (!type || !strcmp(type, "text")) {
177 ret = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
178 if (ret) {
179 g_strchug (g_strchomp (ret));
180
181 if (htmlified) {
182 tmp = atom10_mark_up_text_content (ret);
183 g_free (ret);
184 ret = tmp;
185 }
186 }
187 } else if (!strcmp(type, "html")) {
188 ret = xhtml_extract (cur, 0, NULL);
189 if (!htmlified)
190 ret = unhtmlize (unxmlize (ret));
191 } else if (!strcmp (type, "xhtml")) {
192 /* The spec says to show the contents of the div tag that MUST be present */
193 ret = xhtml_extract (cur, 2, NULL);
194
195 if (!htmlified)
196 ret = unhtmlize (ret);
197 } else {
198 /* Invalid Atom feed */
199 ret = g_strdup ("This attribute was invalidly specified in this Atom feed.");
200 }
201
202 g_free (type);
203
204 return ret;
205 }
206
207 static gchar *
atom10_parse_person_construct(xmlNodePtr cur)208 atom10_parse_person_construct (xmlNodePtr cur)
209 {
210 gchar *tmp = NULL;
211 gchar *name = NULL, *uri = NULL, *email = NULL;
212 gboolean invalid = FALSE;
213
214 cur = cur->xmlChildrenNode;
215 while (cur) {
216 if (NULL == cur->name || cur->type != XML_ELEMENT_NODE || cur->ns == NULL || cur->ns->href == NULL) {
217 cur = cur->next;
218 continue;
219 }
220
221 if (xmlStrEqual (cur->ns->href, ATOM10_NS)) {
222 if (xmlStrEqual (cur->name, BAD_CAST"name")) {
223 g_free (name);
224 name = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
225 }
226
227 if (xmlStrEqual (cur->name, BAD_CAST"email")) {
228 if (email)
229 invalid = TRUE;
230 g_free(email);
231 tmp = (gchar *)xmlNodeListGetString(cur->doc, cur->xmlChildrenNode, 1);
232 email = g_markup_printf_escaped (" - <a href=\"mailto:%s\">%s</a>", tmp, tmp);
233 g_free(tmp);
234 }
235
236 if (xmlStrEqual(cur->name, BAD_CAST"uri")) {
237 if (uri)
238 invalid = TRUE;
239 g_free (uri);
240 tmp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
241 uri = g_markup_printf_escaped (" (<a href=\"%s\">%s</a>)", tmp, _("Website"));
242 g_free (tmp);
243 }
244 } else {
245 /* FIXME: handle extension elements here */
246 }
247 cur = cur->next;
248 }
249
250 if (!name)
251 invalid = TRUE;
252
253 if (!invalid)
254 tmp = g_strdup_printf ("%s%s%s", name, uri?uri:"", email?email:"");
255 else
256 tmp = NULL;
257
258 g_free (uri);
259 g_free (email);
260 g_free (name);
261 return tmp;
262 }
263
264 /* Note: this function is called for both item and feed context */
265 static gchar *
atom10_parse_link(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)266 atom10_parse_link (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
267 {
268 gchar *href, *alternate = NULL;
269
270 href = xml_get_ns_attribute (cur, "href", NULL);
271 if (href) {
272 xmlChar *baseURL = xmlNodeGetBase (cur->doc, cur);
273 gchar *url, *relation, *type, *escTitle = NULL, *title;
274 const gchar *feedURL = subscription_get_homepage (ctxt->subscription);
275
276 if (!baseURL && feedURL && feedURL[0] != '|' && strstr (feedURL, "://"))
277 baseURL = xmlStrdup (BAD_CAST (feedURL));
278 url = (gchar *)common_build_url (href, (gchar *)baseURL);
279
280 type = xml_get_ns_attribute (cur, "type", NULL);
281 relation = xml_get_ns_attribute (cur, "rel", NULL);
282 title = xml_get_ns_attribute (cur, "title", NULL);
283 if (title)
284 escTitle = g_markup_escape_text (title, -1);
285
286 if (!xmlHasNsProp (cur, BAD_CAST"rel", NULL) || !relation || g_str_equal (relation, BAD_CAST"alternate")) {
287 alternate = g_strdup (url);
288 } else if (g_str_equal (relation, "self")) {
289 alternate = g_strdup (url);
290 } else if (g_str_equal (relation, "replies")) {
291 if (!type || g_str_equal (type, BAD_CAST"application/atom+xml")) {
292 gchar *commentUri = (gchar *)common_build_url ((gchar *)url, subscription_get_homepage (ctxt->subscription));
293 if (ctxt->item)
294 metadata_list_set (&ctxt->item->metadata, "commentFeedUri", commentUri);
295 g_free (commentUri);
296 }
297 } else if (g_str_equal (relation, "enclosure")) {
298 if (ctxt->item) {
299 gsize length = 0;
300 gchar *lengthStr = xml_get_ns_attribute (cur, "length", NULL);
301 if (lengthStr)
302 length = atol (lengthStr);
303 g_free (lengthStr);
304
305 gchar *encStr = enclosure_values_to_string (url, type, length, FALSE /* not yet downloaded */);
306 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "enclosure", encStr);
307 ctxt->item->hasEnclosure = TRUE;
308 g_free (encStr);
309 }
310 } else if (g_str_equal (relation, "related") || g_str_equal (relation, "via")) {
311 if (ctxt->item)
312 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, relation, url);
313 } else {
314 /* g_warning ("Unhandled Atom link with unexpected relation \"%s\"\n", relation); */
315 }
316 xmlFree (title);
317 xmlFree (baseURL);
318 g_free (escTitle);
319 g_free (url);
320 g_free(relation);
321 g_free(type);
322 g_free(href);
323 } else {
324 /* FIXME: @href is required, this document is not valid Atom */;
325 }
326
327 return alternate;
328 }
329
330 static void
atom10_parse_entry_author(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)331 atom10_parse_entry_author (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
332 {
333 gchar *author;
334
335 author = atom10_parse_person_construct (cur);
336 if (author) {
337 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "author", author);
338 g_free (author);
339 }
340 }
341
342 static void
atom10_parse_entry_category(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)343 atom10_parse_entry_category (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
344 {
345 gchar *category = NULL;
346
347 category = xml_get_ns_attribute (cur, "label", NULL);
348 if (!category)
349 category = xml_get_ns_attribute (cur, "term", NULL);
350
351 if (category) {
352 gchar *escaped = g_markup_escape_text (category, -1);
353
354 /* Black-list some categories used by Google Reader clone online
355 readers that should not be visible to the end-user */
356 if (!g_str_equal (category, "reading-list") &&
357 !g_str_equal (category, "read") &&
358 !strstr(category, "user/-/label/"))
359 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "category", escaped);
360
361 g_free (escaped);
362 xmlFree (category);
363 }
364 }
365
366 static void
atom10_parse_entry_content(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)367 atom10_parse_entry_content (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
368 {
369 gchar *content;
370
371 content = atom10_parse_content_construct (cur, ctxt);
372 if (content) {
373 item_set_description (ctxt->item, content);
374 g_free (content);
375 }
376 }
377
378 static void
atom10_parse_entry_contributor(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)379 atom10_parse_entry_contributor (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
380 {
381 gchar *contributor;
382
383 contributor = atom10_parse_person_construct (cur);
384 if (contributor) {
385 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "contributor", contributor);
386 g_free (contributor);
387 }
388 }
389
390 static void
atom10_parse_entry_id(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)391 atom10_parse_entry_id (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
392 {
393 gchar *id;
394
395 id = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
396 if (id) {
397 if (strlen (id) > 0) {
398 item_set_id (ctxt->item, id);
399 ctxt->item->validGuid = TRUE;
400 }
401 g_free (id);
402 }
403 }
404
405 static void
atom10_parse_entry_link(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)406 atom10_parse_entry_link (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
407 {
408 gchar *href;
409
410 href = atom10_parse_link (cur, ctxt, state);
411 if (href) {
412 item_set_source (ctxt->item, href);
413 g_free (href);
414 }
415 }
416
417 static void
atom10_parse_entry_published(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)418 atom10_parse_entry_published (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
419 {
420 gchar *datestr;
421
422 datestr = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
423 if (datestr) {
424 ctxt->item->time = date_parse_ISO8601 (datestr);
425 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "pubDate", datestr);
426 g_free (datestr);
427 }
428 }
429
430 static void
atom10_parse_entry_rights(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)431 atom10_parse_entry_rights (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
432 {
433 gchar *rights;
434
435 rights = atom10_parse_text_construct (cur, FALSE);
436 if (rights) {
437 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "copyright", rights);
438 g_free (rights);
439 }
440 }
441
442 /* <summary> can be used for short text descriptions, if there is no
443 <content> description we show the <summary> content */
444 static void
atom10_parse_entry_summary(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)445 atom10_parse_entry_summary (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
446 {
447 gchar *summary;
448
449 summary = atom10_parse_text_construct (cur, TRUE);
450 if (summary) {
451 item_set_description (ctxt->item, summary);
452 g_free (summary);
453 }
454 /* FIXME: set a flag to show a "Read more" link to the user; but where? */
455 }
456
457 static void
atom10_parse_entry_title(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)458 atom10_parse_entry_title (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
459 {
460 gchar *title;
461
462 title = atom10_parse_text_construct(cur, FALSE);
463 if (title) {
464 item_set_title (ctxt->item, title);
465 g_free (title);
466 }
467 }
468
469 static void
atom10_parse_entry_updated(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)470 atom10_parse_entry_updated (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
471 {
472 gchar *datestr;
473
474 datestr = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
475 /* if pubDate is already set, don't overwrite it */
476 if (datestr && !metadata_list_get(ctxt->item->metadata, "pubDate")) {
477 ctxt->item->time = date_parse_ISO8601 (datestr);
478 ctxt->item->metadata = metadata_list_append (ctxt->item->metadata, "contentUpdateDate", datestr);
479 }
480
481 g_free (datestr);
482 }
483
484 /* <content> tag support, FIXME: base64 not supported */
485 /* method to parse standard tags for each item element */
486 static itemPtr
atom10_parse_entry(feedParserCtxtPtr ctxt,xmlNodePtr cur)487 atom10_parse_entry (feedParserCtxtPtr ctxt, xmlNodePtr cur)
488 {
489 NsHandler *nsh;
490 parseItemTagFunc pf;
491 atom10ElementParserFunc func;
492 static GHashTable *entryElementHash = NULL;
493
494 if (!entryElementHash) {
495 entryElementHash = g_hash_table_new (g_str_hash, g_str_equal);
496
497 g_hash_table_insert (entryElementHash, "author", &atom10_parse_entry_author);
498 g_hash_table_insert (entryElementHash, "category", &atom10_parse_entry_category);
499 g_hash_table_insert (entryElementHash, "content", &atom10_parse_entry_content);
500 g_hash_table_insert (entryElementHash, "contributor", &atom10_parse_entry_contributor);
501 g_hash_table_insert (entryElementHash, "id", &atom10_parse_entry_id);
502 g_hash_table_insert (entryElementHash, "link", &atom10_parse_entry_link);
503 g_hash_table_insert (entryElementHash, "published", &atom10_parse_entry_published);
504 g_hash_table_insert (entryElementHash, "rights", &atom10_parse_entry_rights);
505 /* FIXME: Parse "source" */
506 g_hash_table_insert (entryElementHash, "summary", &atom10_parse_entry_summary);
507 g_hash_table_insert (entryElementHash, "title", &atom10_parse_entry_title);
508 g_hash_table_insert (entryElementHash, "updated", &atom10_parse_entry_updated);
509 }
510
511 ctxt->item = item_new ();
512
513 cur = cur->xmlChildrenNode;
514 while (cur) {
515
516 if (cur->type != XML_ELEMENT_NODE || cur->name == NULL || cur->ns == NULL) {
517 cur = cur->next;
518 continue;
519 }
520
521 if ((cur->ns->href && (nsh = (NsHandler *)g_hash_table_lookup (ns_atom10_ns_uri_table, (gpointer)cur->ns->href))) ||
522 (cur->ns->prefix && (nsh = (NsHandler *)g_hash_table_lookup (atom10_nstable, (gpointer)cur->ns->prefix)))) {
523
524 pf = nsh->parseItemTag;
525 if (pf)
526 (*pf) (ctxt, cur);
527 cur = cur->next;
528 continue;
529 }
530
531 /* check namespace of this tag */
532 if (!cur->ns->href) {
533 /* This is an invalid feed... no idea what to do with the current element */
534 debug1 (DEBUG_PARSING, "element with no namespace found in atom feed (%s)!", cur->name);
535 cur = cur->next;
536 continue;
537 }
538
539
540 if (xmlStrcmp(cur->ns->href, ATOM10_NS)) {
541 debug1(DEBUG_PARSING, "unknown namespace %s found!", cur->ns->href);
542 cur = cur->next;
543 continue;
544 }
545 /* At this point, the namespace must be the Atom 1.0 namespace */
546 func = g_hash_table_lookup (entryElementHash, cur->name);
547 if (func) {
548 (*func) (cur, ctxt, NULL);
549 } else {
550 debug1 (DEBUG_PARSING, "unknown entry element \"%s\" found", cur->name);
551 }
552
553 cur = cur->next;
554 }
555
556 /* after parsing we fill the infos into the itemPtr structure */
557 ctxt->item->readStatus = FALSE;
558
559 if (0 == ctxt->item->time)
560 ctxt->item->time = ctxt->feed->time;
561
562 return ctxt->item;
563 }
564
565 static void
atom10_parse_feed_author(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)566 atom10_parse_feed_author (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
567 {
568 /* parse feed author */
569 gchar *author = atom10_parse_person_construct (cur);
570 if (author) {
571 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "author", author);
572 g_free (author);
573 }
574 /* FIXME: make item parsing use this author if not specified elsewhere */
575 }
576
577 static void
atom10_parse_feed_category(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)578 atom10_parse_feed_category (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
579 {
580 gchar *label = NULL;
581
582 label = xml_get_ns_attribute (cur, "label", NULL);
583 if (!label)
584 label = xml_get_ns_attribute (cur, "term", NULL);
585
586 if (label) {
587 gchar *escaped = g_markup_escape_text (label, -1);
588 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "category", escaped);
589 g_free (escaped);
590 xmlFree (label);
591 }
592 }
593
594 static void
atom10_parse_feed_contributor(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)595 atom10_parse_feed_contributor (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
596 {
597 /* parse feed contributors */
598 gchar *contributer = atom10_parse_person_construct (cur);
599 if (contributer) {
600 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "contributor", contributer);
601 g_free (contributer);
602 }
603 }
604
605 static void
atom10_parse_feed_generator(xmlNodePtr cur,feedParserCtxtPtr ctxt,itemPtr ip,struct atom10ParserState * state)606 atom10_parse_feed_generator (xmlNodePtr cur, feedParserCtxtPtr ctxt, itemPtr ip, struct atom10ParserState *state)
607 {
608 gchar *ret, *version, *tmp = NULL, *uri;
609
610 ret = unhtmlize ((gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1));
611 if (ret && ret[0] != '\0') {
612 version = xml_get_ns_attribute (cur, "version", NULL);
613 if (version) {
614 tmp = g_strdup_printf ("%s %s", ret, version);
615 g_free (ret);
616 g_free (version);
617 ret = tmp;
618 }
619 uri = xml_get_ns_attribute (cur, "uri", NULL);
620 if (uri) {
621 tmp = g_markup_printf_escaped ("<a href=\"%s\">%s</a>", uri, ret);
622 g_free (uri);
623 g_free (ret);
624 ret = tmp;
625 }
626 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "feedgenerator", tmp);
627 }
628 g_free (ret);
629 }
630
631 static void
atom10_parse_feed_icon(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)632 atom10_parse_feed_icon (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
633 {
634 gchar *icon_uri;
635
636 icon_uri = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
637
638 if (icon_uri) {
639 debug1 (DEBUG_PARSING, "icon URI found in atom feed: %s", icon_uri);
640 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata,
641 "icon", icon_uri);
642 }
643 }
644
645 static void
atom10_parse_feed_id(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)646 atom10_parse_feed_id (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
647 {
648 /* FIXME: Parse ID, but I'm not sure where Liferea would use it */
649 }
650
651 static void
atom10_parse_feed_link(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)652 atom10_parse_feed_link (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
653 {
654 gchar *href;
655
656 href = atom10_parse_link (cur, ctxt, state);
657 if (href) {
658 xmlChar *baseURL = xmlNodeGetBase (cur->doc, xmlDocGetRootElement (cur->doc));
659
660 subscription_set_homepage (ctxt->subscription, href);
661 /* Set the default base to the feed's HTML URL if not set yet */
662 if (baseURL == NULL)
663 xmlNodeSetBase (xmlDocGetRootElement (cur->doc), (xmlChar *)href);
664 else xmlFree (baseURL);
665 g_free (href);
666 }
667 }
668
669 static void
atom10_parse_feed_logo(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)670 atom10_parse_feed_logo (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
671 {
672 gchar *logoUrl;
673
674 logoUrl = atom10_parse_text_construct (cur, FALSE);
675 if (logoUrl) {
676 metadata_list_set (&ctxt->subscription->metadata, "imageUrl", logoUrl);
677 g_free (logoUrl);
678 }
679 }
680
681 static void
atom10_parse_feed_rights(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)682 atom10_parse_feed_rights (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
683 {
684 gchar *rights;
685
686 rights = atom10_parse_text_construct (cur, FALSE);
687 if (rights) {
688 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "copyright", rights);
689 g_free (rights);
690 }
691 }
692
693 static void
atom10_parse_feed_subtitle(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)694 atom10_parse_feed_subtitle (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
695 {
696 gchar *subtitle;
697
698 subtitle = atom10_parse_text_construct (cur, TRUE);
699 if (subtitle) {
700 metadata_list_set (&ctxt->subscription->metadata, "description", subtitle);
701 g_free (subtitle);
702 }
703 }
704
705 static void
atom10_parse_feed_title(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)706 atom10_parse_feed_title (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
707 {
708 gchar *title;
709
710 title = atom10_parse_text_construct(cur, FALSE);
711 if (title) {
712 if (ctxt->title)
713 g_free (ctxt->title);
714 ctxt->title = title;
715 }
716 }
717
718 /* Sort items in descending date order (newer items first). */
719 static gint
atom10_item_sort_by_date(gconstpointer a,gconstpointer b)720 atom10_item_sort_by_date (gconstpointer a, gconstpointer b)
721 {
722 itemPtr item1 = (itemPtr)a;
723 itemPtr item2 = (itemPtr)b;
724
725 g_assert (item1 && item2);
726
727 if (item1->time == item2->time) {
728 /* Items identical.. can we distinguish further? */
729 return 0;
730 }
731
732 if (item1->time < item2->time)
733 return 1;
734 if (item1->time > item2->time)
735 return -1;
736
737 return 0;
738 }
739
740 static void
atom10_parse_feed_updated(xmlNodePtr cur,feedParserCtxtPtr ctxt,struct atom10ParserState * state)741 atom10_parse_feed_updated (xmlNodePtr cur, feedParserCtxtPtr ctxt, struct atom10ParserState *state)
742 {
743 gchar *timestamp;
744
745 timestamp = (gchar *)xmlNodeListGetString (cur->doc, cur->xmlChildrenNode, 1);
746 if (timestamp) {
747 ctxt->subscription->metadata = metadata_list_append (ctxt->subscription->metadata, "contentUpdateDate", timestamp);
748 ctxt->feed->time = date_parse_ISO8601 (timestamp);
749 g_free (timestamp);
750 }
751 }
752
753 /* reads a Atom feed URL and returns a new channel structure (even if
754 the feed could not be read) */
755 static void
atom10_parse_feed(feedParserCtxtPtr ctxt,xmlNodePtr cur)756 atom10_parse_feed (feedParserCtxtPtr ctxt, xmlNodePtr cur)
757 {
758 NsHandler *nsh;
759 parseChannelTagFunc pf;
760 atom10ElementParserFunc func;
761 static GHashTable *feedElementHash = NULL;
762
763 if(!feedElementHash) {
764 feedElementHash = g_hash_table_new (g_str_hash, g_str_equal);
765
766 g_hash_table_insert (feedElementHash, "author", &atom10_parse_feed_author);
767 g_hash_table_insert (feedElementHash, "category", &atom10_parse_feed_category);
768 g_hash_table_insert (feedElementHash, "contributor", &atom10_parse_feed_contributor);
769 g_hash_table_insert (feedElementHash, "generator", &atom10_parse_feed_generator);
770 g_hash_table_insert (feedElementHash, "icon", &atom10_parse_feed_icon);
771 g_hash_table_insert (feedElementHash, "id", &atom10_parse_feed_id);
772 g_hash_table_insert (feedElementHash, "link", &atom10_parse_feed_link);
773 g_hash_table_insert (feedElementHash, "logo", &atom10_parse_feed_logo);
774 g_hash_table_insert (feedElementHash, "rights", &atom10_parse_feed_rights);
775 g_hash_table_insert (feedElementHash, "subtitle", &atom10_parse_feed_subtitle);
776 g_hash_table_insert (feedElementHash, "title", &atom10_parse_feed_title);
777 g_hash_table_insert (feedElementHash, "updated", &atom10_parse_feed_updated);
778 }
779
780 while (TRUE) {
781 if (xmlStrcmp (cur->name, BAD_CAST"feed")) {
782 g_string_append (ctxt->feed->parseErrors, "<p>Could not find Atom 1.0 header!</p>");
783 break;
784 }
785
786 /* parse feed contents */
787 cur = cur->xmlChildrenNode;
788 while (cur) {
789 if (!cur->name || cur->type != XML_ELEMENT_NODE || !cur->ns) {
790 cur = cur->next;
791 continue;
792 }
793
794 /* check if supported namespace should handle the current tag
795 by trying to determine a namespace handler */
796
797 nsh = NULL;
798
799 if (cur->ns->href)
800 nsh = (NsHandler *)g_hash_table_lookup (ns_atom10_ns_uri_table, (gpointer)cur->ns->href);
801
802 if (cur->ns->prefix && !nsh)
803 nsh = (NsHandler *)g_hash_table_lookup (atom10_nstable, (gpointer)cur->ns->prefix);
804
805 if(nsh) {
806 pf = nsh->parseChannelTag;
807 if(pf)
808 (*pf)(ctxt, cur);
809 cur = cur->next;
810 continue;
811 }
812
813 /* check namespace of this tag */
814 if (!cur->ns->href) {
815 /* This is an invalid feed... no idea what to do with the current element */
816 debug1 (DEBUG_PARSING, "element with no namespace found in atom feed (%s)!", cur->name);
817 cur = cur->next;
818 continue;
819 }
820
821 if (xmlStrcmp (cur->ns->href, ATOM10_NS)) {
822 debug1 (DEBUG_PARSING, "unknown namespace %s found in atom feed!", cur->ns->href);
823 cur = cur->next;
824 continue;
825 }
826 /* At this point, the namespace must be the Atom 1.0 namespace */
827
828 func = g_hash_table_lookup (feedElementHash, cur->name);
829 if (func) {
830 (*func) (cur, ctxt, NULL);
831 } else if (xmlStrEqual (cur->name, BAD_CAST"entry")) {
832 ctxt->item = atom10_parse_entry (ctxt, cur);
833 if (ctxt->item)
834 ctxt->items = g_list_insert_sorted (ctxt->items, ctxt->item, atom10_item_sort_by_date);
835 }
836 cur = cur->next;
837 }
838
839 /* FIXME: Maybe check to see that the required information was actually provided (persuant to the RFC). */
840 /* after parsing we fill in the infos into the feedPtr structure */
841
842 break;
843 }
844 }
845
846 static gboolean
atom10_format_check(xmlDocPtr doc,xmlNodePtr cur)847 atom10_format_check (xmlDocPtr doc, xmlNodePtr cur)
848 {
849 if (cur->name == NULL || cur->ns == NULL || cur->ns->href == NULL)
850 return FALSE;
851 return xmlStrEqual (cur->name, BAD_CAST"feed") && xmlStrEqual (cur->ns->href, ATOM10_NS);
852 }
853
854 static void
atom10_add_ns_handler(NsHandler * handler)855 atom10_add_ns_handler (NsHandler *handler)
856 {
857 g_assert (NULL != atom10_nstable);
858 g_hash_table_insert (atom10_nstable, (gpointer)handler->prefix, handler);
859 g_assert (handler->registerNs != NULL);
860 handler->registerNs (handler, atom10_nstable, ns_atom10_ns_uri_table);
861 }
862
863 feedHandlerPtr
atom10_init_feed_handler(void)864 atom10_init_feed_handler (void)
865 {
866 feedHandlerPtr fhp;
867
868 fhp = g_new0 (struct feedHandler, 1);
869
870 if (!atom10_nstable) {
871 atom10_nstable = g_hash_table_new (g_str_hash, g_str_equal);
872 ns_atom10_ns_uri_table = g_hash_table_new (g_str_hash, g_str_equal);
873
874 /* register name space handlers */
875 atom10_add_ns_handler (ns_bC_get_handler ());
876 atom10_add_ns_handler (ns_dc_get_handler ());
877 atom10_add_ns_handler (ns_slash_get_handler ());
878 atom10_add_ns_handler (ns_content_get_handler ());
879 atom10_add_ns_handler (ns_syn_get_handler ());
880 atom10_add_ns_handler (ns_admin_get_handler ());
881 atom10_add_ns_handler (ns_ag_get_handler ());
882 atom10_add_ns_handler (ns_cC_get_handler ());
883 atom10_add_ns_handler (ns_photo_get_handler ());
884 atom10_add_ns_handler (ns_pb_get_handler ());
885 atom10_add_ns_handler (ns_wfw_get_handler ());
886 atom10_add_ns_handler (ns_media_get_handler ());
887 atom10_add_ns_handler (ns_trackback_get_handler ());
888 atom10_add_ns_handler (ns_georss_get_handler ());
889 }
890 /* prepare feed handler structure */
891 fhp->typeStr = "atom";
892 fhp->feedParser = atom10_parse_feed;
893 fhp->checkFormat = atom10_format_check;
894
895 return fhp;
896 }
897