1 /*
2  * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3  *
4  * This program is free software; you can redistribute it and/or
5  * modify it under the terms of the GNU General Public License as
6  * published by the Free Software Foundation; either version 2 of the
7  * License, or (at your option) any later version.
8  *
9  * This program is distributed in the hope that it will be useful,
10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12  * General Public License for more details.
13  *
14  * You should have received a copy of the GNU General Public
15  * License along with this program; if not, write to the
16  * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17  * Boston, MA 02111-1307, USA.
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23 
24 #include <glib.h>
25 #include <curl/curl.h>
26 #include <expat.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <errno.h>
30 
31 #include "feed.h"
32 
33 #include "parser.h"
34 
/*
 * Feed flavours recognised from the document's root element.
 *
 * Note that "FeedTypes" is not a type name: this statement defines a
 * file-scope variable of the anonymous enum type. The code visible in
 * this file only ever uses the enumerators themselves.
 */
enum {
	FEED_TYPE_NONE    = 0,	/* nothing recognised */
	FEED_TYPE_RDF     = 1,	/* root element "rdf:RDF" */
	FEED_TYPE_RSS_20  = 2,	/* root element "rss" */
	FEED_TYPE_ATOM_03 = 3,	/* root element "feed" without the Atom 1.0 namespace */
	FEED_TYPE_ATOM_10 = 4,	/* root element "feed" with the Atom 1.0 namespace */
	FEED_TYPE_OPML    = 5	/* not assigned by the code visible here */
} FeedTypes;
43 
/*
 * Install the element start/end callbacks that match a detected feed type.
 *
 * parser: expat parser to reconfigure; NULL is silently ignored.
 * type:   one of the FEED_TYPE_* values. Only RSS 2.0, RDF and Atom 1.0
 *         have dedicated callbacks; for every other value the handlers
 *         currently installed on the parser are left untouched.
 */
static void _handler_set(XML_Parser parser, guint type)
{
	if( parser == NULL )
		return;

	if( type == FEED_TYPE_RSS_20 ) {
		XML_SetElementHandler(parser,
				feed_parser_rss20_start,
				feed_parser_rss20_end);
	} else if( type == FEED_TYPE_RDF ) {
		XML_SetElementHandler(parser,
				feed_parser_rdf_start,
				feed_parser_rdf_end);
	} else if( type == FEED_TYPE_ATOM_10 ) {
		XML_SetElementHandler(parser,
				feed_parser_atom10_start,
				feed_parser_atom10_end);
	}
}
69 
_elparse_start_chooser(void * data,const gchar * el,const gchar ** attr)70 static void _elparse_start_chooser(void *data,
71 		const gchar *el, const gchar **attr)
72 {
73 	FeedParserCtx *ctx = (FeedParserCtx *)data;
74 	guint feedtype = FEED_TYPE_NONE;
75 	gchar *version;
76 
77 	if( ctx->depth == 0 ) {
78 
79 		/* RSS 2.0 detected */
80 		if( !strcmp(el, "rss") ) {
81 			feedtype = FEED_TYPE_RSS_20;
82 		} else if( !strcmp(el, "rdf:RDF") ) {
83 			feedtype = FEED_TYPE_RDF;
84 		} else if( !strcmp(el, "feed") ) {
85 
86 			/* ATOM feed detected, let's check version */
87 			version = feed_parser_get_attribute_value(attr, "xmlns");
88 			if( version != NULL &&
89 					(!strcmp(version, "http://www.w3.org/2005/Atom") ||
90 					 !strcmp(version, "https://www.w3.org/2005/Atom")) )
91 				feedtype = FEED_TYPE_ATOM_10;
92 			else
93 				feedtype = FEED_TYPE_ATOM_03;
94 		} else {
95 			/* Not a known feed type */
96 			ctx->feed->is_valid = FALSE;
97 		}
98 	}
99 
100 	_handler_set(ctx->parser, feedtype);
101 
102 	ctx->depth++;
103 }
104 
_elparse_end_dummy(void * data,const gchar * el)105 static void _elparse_end_dummy(void *data, const gchar *el)
106 {
107 	FeedParserCtx *ctx = (FeedParserCtx *)data;
108 
109 	if( ctx->str != NULL ) {
110 		g_string_free(ctx->str, TRUE);
111 		ctx->str = NULL;
112 	}
113 
114 	ctx->depth--;
115 }
116 
/*
 * Character-data callback: accumulates element text in ctx->str.
 *
 * A chunk consisting only of whitespace is dropped while no text has
 * been accumulated yet (ctx->str == NULL), so leading blanks never start
 * a new string. Once a string exists, every chunk is appended verbatim.
 *
 * data: the FeedParserCtx registered as expat user data.
 * s:    chunk of character data; it is read by length and is not
 *       required to be NUL-terminated.
 * len:  number of bytes in s. Empty or negative lengths are ignored.
 */
void libfeed_expat_chparse(void *data, const gchar *s, gint len)
{
	FeedParserCtx *ctx = (FeedParserCtx *)data;
	gint i;

	if( s == NULL || len <= 0 )
		return;

	if( ctx->str == NULL ) {
		/* The cast matters: passing a negative char (any byte >= 0x80
		 * where char is signed) to isspace() is undefined behaviour. */
		for( i = 0; i < len; i++ ) {
			if( !isspace((unsigned char)s[i]) )
				break;
		}

		/* Nothing but blanks and no string started yet: skip the chunk. */
		if( i == len )
			return;

		ctx->str = g_string_sized_new(len + 1);
	}

	/* Append straight from the parser's buffer; no temporary copy needed. */
	g_string_append_len(ctx->str, s, len);
}
145 
146 
/*
 * Wire a freshly created parser context to its expat callbacks.
 *
 * The context itself becomes the parser's user data. Element handling
 * starts with the feed-type chooser, character data goes to
 * libfeed_expat_chparse() and encodings expat does not know are
 * delegated to feed_parser_unknown_encoding_handler().
 *
 * ctx: parser context; ctx->parser is the parser being configured.
 */
void feed_parser_set_expat_handlers(FeedParserCtx *ctx)
{
	XML_SetUnknownEncodingHandler(ctx->parser,
			feed_parser_unknown_encoding_handler, NULL);

	XML_SetCharacterDataHandler(ctx->parser, libfeed_expat_chparse);

	XML_SetElementHandler(ctx->parser,
			_elparse_start_chooser, _elparse_end_dummy);

	XML_SetUserData(ctx->parser, ctx);
}
161 
feed_writefunc(void * ptr,size_t size,size_t nmemb,void * data)162 size_t feed_writefunc(void *ptr, size_t size, size_t nmemb, void *data)
163 {
164 	gint len = size * nmemb;
165 	FeedParserCtx *ctx = (FeedParserCtx *)data;
166 	gint status, err;
167 
168 	if (!ctx->feed->is_valid) {
169 		/* We already know that the feed is not valid, so we won't
170 		 * try parsing it. Just return correct number so libcurl is
171 		 * happy. */
172 		return len;
173 	}
174 
175 	status = XML_Parse(ctx->parser, ptr, len, FALSE);
176 
177 	if( status == XML_STATUS_ERROR ) {
178 		err = XML_GetErrorCode(ctx->parser);
179 		printf("\nExpat: --- %s\n\n", XML_ErrorString(err));
180 		ctx->feed->is_valid = FALSE;
181 	}
182 
183 	return len;
184 }
185 
feed_parser_get_attribute_value(const gchar ** attr,const gchar * name)186 gchar *feed_parser_get_attribute_value(const gchar **attr, const gchar *name)
187 {
188 	guint i;
189 
190 	if( attr == NULL || name == NULL )
191 		return NULL;
192 
193 	for( i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2 ) {
194 		if( !strcmp( attr[i], name) )
195 			return (gchar *)attr[i+1];
196 	}
197 
198 	/* We haven't found anything. */
199 	return NULL;
200 }
201 
/* Size in bytes of the output buffer holding one UTF-32 character. */
#define CHARSIZEUTF32	4

/* Result codes of giconv_utf32_char(). */
enum {
	LEP_ICONV_OK      = 0,	/* exactly one character was converted */
	LEP_ICONV_FAILED  = 1,	/* input or output left over, or iconv unavailable */
	LEP_ICONV_ILSEQ   = 2,	/* conversion failed with EILSEQ */
	LEP_ICONV_INVAL   = 3,	/* conversion failed with EINVAL */
	LEP_ICONV_UNKNOWN = 4	/* conversion failed with any other errno */
};
211 
giconv_utf32_char(GIConv cd,const gchar * inbuf,size_t insize,guint32 * p_value)212 static gint giconv_utf32_char(GIConv cd, const gchar *inbuf, size_t insize,
213 		guint32 *p_value)
214 {
215 #ifdef HAVE_ICONV
216 	size_t outsize;
217 	guchar outbuf[CHARSIZEUTF32];
218 	gchar *outbufp;
219 	gint r;
220 
221 	outsize = sizeof(outbuf);
222 	outbufp = (gchar *)outbuf;
223 #ifdef HAVE_ICONV_PROTO_CONST
224 	r = g_iconv(cd, (const gchar **)&inbuf, &insize,
225 			&outbufp, &outsize);
226 #else
227 	r = g_iconv(cd, (gchar **)&inbuf, &insize,
228 			&outbufp, &outsize);
229 #endif
230 	if( r == -1 ) {
231 		g_iconv(cd, 0, 0, 0, 0);
232 		switch(errno) {
233 		case EILSEQ:
234 			return LEP_ICONV_ILSEQ;
235 		case EINVAL:
236 			return LEP_ICONV_INVAL;
237 		default:
238 			return LEP_ICONV_UNKNOWN;
239 		}
240 	} else {
241 		guint32 value;
242 		guint i;
243 
244 		if( (insize > 0) || (outsize > 0) )
245 			return LEP_ICONV_FAILED;
246 
247 		value = 0;
248 		for( i = 0; i < sizeof(outbuf); i++ ) {
249 			value = (value << 8) + outbuf[i];
250 		}
251 		*p_value = value;
252 		return LEP_ICONV_OK;
253 	}
254 #else
255 	return LEP_ICONV_FAILED;
256 #endif
257 }
258 
feed_parser_setup_unknown_encoding(const gchar * charset,XML_Encoding * info)259 static gint feed_parser_setup_unknown_encoding(const gchar *charset,
260 		XML_Encoding *info)
261 {
262 	GIConv cd;
263 	gint flag, r;
264 	gchar buf[4];
265 	guint i, j, k;
266 	guint32 value;
267 
268 	cd = g_iconv_open("UTF-32BE", charset);
269 	if( cd == (GIConv) -1 )
270 		return -1;
271 
272 	flag = 0;
273 	for( i = 0; i < 256; i++ ) {
274 		/* first char */
275 		buf[0] = i;
276 		info->map[i] = 0;
277 		r = giconv_utf32_char(cd, buf, 1, &value);
278 		if( r == LEP_ICONV_OK) {
279 			info->map[i] = value;
280 		} else if( r != LEP_ICONV_INVAL ) {
281 		} else {
282 			for( j = 0; j < 256; j++ ) {
283 				/* second char */
284 				buf[1] = j;
285 				r = giconv_utf32_char(cd, buf, 2, &value);
286 				if( r == LEP_ICONV_OK ) {
287 					flag = 1;
288 					info->map[i] = -2;
289 				} else if( r != LEP_ICONV_INVAL ) {
290 				} else {
291 					for( k = 0; k < 256; k++ ) {
292 						/* third char */
293 						buf[2] = k;
294 						r = giconv_utf32_char(cd, buf, 3, &value);
295 						if( r == LEP_ICONV_OK) {
296 							info->map[i] = -3;
297 						}
298 					}
299 				}
300 			}
301 		}
302 	}
303 
304 	g_iconv_close(cd);
305 
306 	return flag;
307 }
308 
/*
 * Per-encoding state handed to expat through XML_Encoding.data for
 * multi-byte charsets. It is allocated in
 * feed_parser_unknown_encoding_handler() and released by
 * feed_parser_unknown_encoding_data_free().
 */
struct FeedParserUnknownEncoding {
	gchar *charset;	/* strdup()ed charset name; released with free() */
	GIConv cd;	/* charset -> "UTF-32BE" conversion descriptor */
};
313 
feed_parser_unknown_encoding_convert(void * data,const gchar * s)314 static gint feed_parser_unknown_encoding_convert(void *data, const gchar *s)
315 {
316 	gint r;
317 	struct FeedParserUnknownEncoding *enc_data;
318 	size_t insize;
319 	guint32 value;
320 
321 	enc_data = data;
322 	insize = 4;
323 
324 	if( s == NULL )
325 		return -1;
326 
327 	r = giconv_utf32_char(enc_data->cd, s, insize, &value);
328 	if( r != LEP_ICONV_OK )
329 		return -1;
330 
331 	return 0;
332 }
333 
feed_parser_unknown_encoding_data_free(void * data)334 static void feed_parser_unknown_encoding_data_free(void *data)
335 {
336 	struct FeedParserUnknownEncoding *enc_data;
337 
338 	enc_data = data;
339 	free(enc_data->charset);
340 	g_iconv_close(enc_data->cd);
341 	free(enc_data);
342 }
343 
/*
 * Unknown-encoding handler registered with expat.
 *
 * The byte map in info is filled first. If that reports a purely
 * single-byte charset (result 0), no convert/release callbacks are
 * needed. Otherwise a charset -> UTF-32BE converter is opened and stored,
 * together with a copy of the charset name, as the encoding's private
 * data; the convert and release callbacks are installed alongside it.
 *
 * encdata: handler user data (unused).
 * name:    name of the encoding declared by the document.
 * info:    encoding description to fill in.
 *
 * Returns XML_STATUS_OK on success, XML_STATUS_ERROR if the converter
 * cannot be opened or memory runs out.
 */
int feed_parser_unknown_encoding_handler(void *encdata, const XML_Char *name,
		XML_Encoding *info)
{
	struct FeedParserUnknownEncoding *enc;
	GIConv conv;

	if( feed_parser_setup_unknown_encoding(name, info) == 0 ) {
		info->data = NULL;
		info->convert = NULL;
		info->release = NULL;
		return XML_STATUS_OK;
	}

	conv = g_iconv_open("UTF-32BE", name);
	if( conv == (GIConv)-1 )
		return XML_STATUS_ERROR;

	enc = malloc( sizeof(*enc) );
	if( enc != NULL )
		enc->charset = strdup(name);

	if( enc == NULL || enc->charset == NULL ) {
		/* free(NULL) is a no-op, so both failure paths share this. */
		free(enc);
		g_iconv_close(conv);
		return XML_STATUS_ERROR;
	}

	enc->cd = conv;

	info->data = enc;
	info->convert = feed_parser_unknown_encoding_convert;
	info->release = feed_parser_unknown_encoding_data_free;

	return XML_STATUS_OK;
}
383