1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19 #define __USE_GNU
20 
21 #include <glib.h>
22 #include <expat.h>
23 #include <string.h>
24 #include <stdio.h>
25 
26 #include <procheader.h>
27 
28 #include "feed.h"
29 #include "feeditem.h"
30 #include "date.h"
31 #include "parser.h"
32 #include "parser_atom10.h"
33 
/*
 * Parser positions within an Atom 1.0 document, stored in ctx->location
 * by the element handlers below.
 *
 * Declared as a typedef so that FeedAtom10Locations names a type; without
 * the typedef keyword this declaration would instead define an unused
 * global variable of an anonymous enum type.
 */
typedef enum {
	FEED_LOC_ATOM10_NONE,		/* unrecognized child of the root element */
	FEED_LOC_ATOM10_ENTRY,		/* inside an <entry> */
	FEED_LOC_ATOM10_AUTHOR,		/* inside an <author> (feed or entry level) */
	FEED_LOC_ATOM10_SOURCE,		/* inside an entry's <source> */
	FEED_LOC_ATOM10_CONTENT		/* inside an entry's <content> */
} FeedAtom10Locations;
41 
feed_parser_atom10_start(void * data,const gchar * el,const gchar ** attr)42 void feed_parser_atom10_start(void *data, const gchar *el, const gchar **attr)
43 {
44 	FeedParserCtx *ctx = (FeedParserCtx *)data;
45 	gchar *a = NULL;
46 
47 	if( ctx->depth == 1 ) {
48 
49 		if( !strcmp(el, "entry") ) {
50 			/* Start of new feed item found.
51 			 * Create a new FeedItem, freeing the one we already have, if any. */
52 			if( ctx->curitem != NULL )
53 				feed_item_free(ctx->curitem);
54 			ctx->curitem = feed_item_new(ctx->feed);
55 			ctx->location = FEED_LOC_ATOM10_ENTRY;
56 		} else if( !strcmp(el, "author") ) {
57 			/* Start of author info for the feed found.
58 			 * Set correct location. */
59 			ctx->location = FEED_LOC_ATOM10_AUTHOR;
60 		} else if( !strcmp(el, "link") ) {
61 			if (!feed_parser_get_attribute_value(attr, "rel")) {
62 				/* Link tag for the feed */
63 				g_free(ctx->feed->link);
64 				ctx->feed->link =
65 					g_strdup(feed_parser_get_attribute_value(attr, "href"));
66 			}
67 		} else ctx->location = FEED_LOC_ATOM10_NONE;
68 
69 	} else if( ctx->depth == 2 ) {
70 
71 		/* Make sure we are in one of known locations within the XML structure.
72 		 * This condition should never be true on a valid Atom feed. */
73 		if (ctx->location != FEED_LOC_ATOM10_AUTHOR &&
74 				ctx->location != FEED_LOC_ATOM10_ENTRY) {
75 			ctx->depth++;
76 			return;
77 		}
78 
79 		if( !strcmp(el, "author") ) {
80 			/* Start of author info for current feed item.
81 			 * Set correct location. */
82 			ctx->location = FEED_LOC_ATOM10_AUTHOR;
83 		} else if( !strcmp(el, "link") ) {
84 			/* Capture item URL, from the "url" XML attribute. */
85 			if (ctx->curitem && ctx->location == FEED_LOC_ATOM10_ENTRY)
86 				ctx->curitem->url = g_strdup(feed_parser_get_attribute_value(attr, "href"));
87 		} else if( !strcmp(el, "source") ) {
88 			ctx->location = FEED_LOC_ATOM10_SOURCE;
89 		} else ctx->location = FEED_LOC_ATOM10_ENTRY;
90 
91 		if( !strcmp(el, "title") && ctx->curitem != NULL) {
92 			a = feed_parser_get_attribute_value(attr, "type");
93 			if( !a || !strcmp(a, "text") )
94 				ctx->curitem->title_format = FEED_ITEM_TITLE_TEXT;
95 			else if( !strcmp(a, "html") )
96 				ctx->curitem->title_format = FEED_ITEM_TITLE_HTML;
97 			else if( !strcmp(a, "xhtml") )
98 				ctx->curitem->title_format = FEED_ITEM_TITLE_XHTML;
99 			else
100 				ctx->curitem->title_format = FEED_ITEM_TITLE_UNKNOWN;
101 		} else if (!strcmp(el, "content") && ctx->curitem != NULL) {
102 			ctx->location = FEED_LOC_ATOM10_CONTENT;
103 			a = feed_parser_get_attribute_value(attr, "type");
104 			if (a && !strcmp(a, "xhtml")) {
105 				ctx->curitem->xhtml_content = TRUE;
106 				ctx->xhtml_str = g_string_new(NULL);
107 			}
108 		}
109 	} else if (ctx->depth >= 3) {
110 		if (ctx->location == FEED_LOC_ATOM10_CONTENT
111 				&& ctx->curitem != NULL
112 				&& ctx->curitem->xhtml_content) {
113 			guint i;
114 			GString *txt = ctx->xhtml_str;
115 			g_string_append_c(txt, '<');
116 			g_string_append(txt, el);
117 
118 			for (i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2) {
119 				g_string_append_printf(txt, " %s='%s'", attr[i], attr[i+1]);
120 			}
121 			g_string_append_c(txt, '>');
122 		}
123 	}
124 
125 
126 	ctx->depth++;
127 }
128 
feed_parser_atom10_end(void * data,const gchar * el)129 void feed_parser_atom10_end(void *data, const gchar *el)
130 {
131 	FeedParserCtx *ctx = (FeedParserCtx *)data;
132 	Feed *feed = ctx->feed;
133 	gchar *text = NULL, *tmp;
134 
135 	if( ctx->str != NULL )
136 		text = g_strstrip(g_strdup(ctx->str->str));
137 	else
138 		text = "";
139 
140 	switch( ctx->depth ) {
141 
142 		case 0:
143 			/* Just in case. */
144 			break;
145 
146 		case 1:
147 
148 			if( !strcmp(el, "feed") ) {
149 				/* We have finished parsing the feed, reverse the list
150 				 * so it's not upside down. */
151 				feed->items = g_slist_reverse(ctx->feed->items);
152 			}
153 
154 			break;
155 
156 		case 2:
157 
158 			/* decide if we just received </entry>, so we can
159 			 * add a complete item to feed */
160 			if( !strcmp(el, "entry") ) {
161 
162 				/* Fix up URL, if it is relative */
163 				if (ctx->curitem->url != NULL &&
164 						!strstr(ctx->curitem->url, "://") &&
165 						ctx->feed->link != NULL) {
166 					tmp = g_strconcat(ctx->feed->link,
167 							(ctx->curitem->url[0] == '/' ? "" : "/"),
168 							ctx->curitem->url, NULL);
169 					feed_item_set_url(ctx->curitem, tmp);
170 					g_free(tmp);
171 				}
172 
173 				/* append the complete feed item */
174 				if( ctx->curitem->id && ctx->curitem->title
175 						&& ctx->curitem->date_modified ) {
176 					feed->items =
177 						g_slist_prepend(feed->items, (gpointer)ctx->curitem);
178 				}
179 
180 				/* since it's in the linked list, lose this pointer */
181 				ctx->curitem = NULL;
182 
183 			} else if( !strcmp(el, "title") ) {	/* so it wasn't end of item */
184 				FILL(feed->title)
185 			} else if( !strcmp(el, "summary" ) ) {
186 				FILL(feed->description)
187 			} else if( !strcmp(el, "updated" ) ) {
188 				feed->date = procheader_date_parse(NULL, text, 0);
189 			}
190 			/* FIXME: add more later */
191 
192 			break;
193 
194 		case 3:
195 
196 			if( ctx->curitem == NULL )
197 				break;
198 
199 			switch(ctx->location) {
200 
201 				/* We're in feed/entry */
202 				case FEED_LOC_ATOM10_ENTRY:
203 					if( !strcmp(el, "title") ) {
204 						FILL(ctx->curitem->title)
205 					} else if( !strcmp(el, "summary") ) {
206 						FILL(ctx->curitem->summary)
207 					} else if( !strcmp(el, "id") ) {
208 						FILL(ctx->curitem->id)
209 						feed_item_set_id_permalink(ctx->curitem, TRUE);
210 					} else if( !strcmp(el, "published") ) {
211 						ctx->curitem->date_published = procheader_date_parse(NULL, text, 0);
212 					} else if( !strcmp(el, "updated") ) {
213 						ctx->curitem->date_modified = procheader_date_parse(NULL, text, 0);
214 					}
215 
216 					break;
217 
218 				/* We're in feed/author or about to leave feed/entry/author */
219 				case FEED_LOC_ATOM10_AUTHOR:
220 					if( !strcmp(el, "author" ) ) {
221 						/* We just finished parsing <author> */
222 						ctx->curitem->author = g_strdup_printf("%s%s%s%s%s",
223 								ctx->name ? ctx->name : "",
224 								ctx->name && ctx->mail ? " <" : ctx->mail ? "<" : "",
225 								ctx->mail ? ctx->mail : "",
226 								ctx->mail ? ">" : "",
227 								!ctx->name && !ctx->mail ? "N/A" : "");
228 						ctx->location = FEED_LOC_ATOM10_ENTRY;
229 					} else if( !strcmp(el, "name") ) {
230 						FILL(feed->author)
231 					}
232 
233 					break;
234 
235 				case FEED_LOC_ATOM10_CONTENT:
236 					if( !strcmp(el, "content") ) {
237 						if (ctx->curitem->xhtml_content) {
238 							/* Just in case the <content> tag itself also has some
239 							 * content of its own, not just the <div> it should,
240 							 * let's append it to the end. */
241 							g_string_append(ctx->xhtml_str, text);
242 							ctx->curitem->text = g_string_free(ctx->xhtml_str, FALSE);
243 							ctx->xhtml_str = NULL;
244 						} else {
245 							FILL(ctx->curitem->text)
246 						}
247 						ctx->location = FEED_LOC_ATOM10_ENTRY;
248 					}
249 
250 					break;
251 			}
252 			break;
253 
254 		case 4:
255 
256 			if( ctx->curitem == NULL )
257 				break;
258 
259 			switch(ctx->location) {
260 
261 				/* We're in feed/entry/author */
262 				case FEED_LOC_ATOM10_AUTHOR:
263 					if( !strcmp(el, "name") ) {
264 						FILL(ctx->name)
265 					} else if( !strcmp(el, "email") ) {
266 						FILL(ctx->mail)
267 					}
268 
269 					break;
270 
271 				/* We're in feed/entry/source */
272 				case FEED_LOC_ATOM10_SOURCE:
273 					if( !strcmp(el, "title" ) ) {
274 						FILL(ctx->curitem->sourcetitle)
275 					} else if( !strcmp(el, "id" ) ) {
276 						FILL(ctx->curitem->sourceid)
277 					} else if( !strcmp(el, "updated" ) ) {
278 						ctx->curitem->sourcedate = procheader_date_parse(NULL, text, 0);
279 					}
280 
281 					break;
282 
283 				case FEED_LOC_ATOM10_CONTENT:
284 					if (ctx->curitem->xhtml_content) {
285 						g_string_append(ctx->xhtml_str, text);
286 						g_string_append_printf(ctx->xhtml_str, "</%s>", el);
287 					}
288 					break;
289 
290 				}
291 
292 
293 			break;
294 
295 		default:
296 			if (ctx->location == FEED_LOC_ATOM10_CONTENT
297 					&& ctx->curitem->xhtml_content) {
298 				g_string_append(ctx->xhtml_str, text);
299 				g_string_append_printf(ctx->xhtml_str, "</%s>", el);
300 			}
301 			break;
302 	}
303 
304 	if( ctx->str != NULL ) {
305 		g_free(text);
306 		g_string_free(ctx->str, TRUE);
307 		ctx->str = NULL;
308 	}
309 	ctx->str = NULL;
310 
311 	ctx->depth--;
312 }
313