1 /*
2 * Copyright (C) 2006 Andrej Kacian <andrej@kacian.sk>
3 *
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License as
6 * published by the Free Software Foundation; either version 2 of the
7 * License, or (at your option) any later version.
8 *
9 * This program is distributed in the hope that it will be useful,
10 * but WITHOUT ANY WARRANTY; without even the implied warranty of
11 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
12 * General Public License for more details.
13 *
14 * You should have received a copy of the GNU General Public
15 * License along with this program; if not, write to the
16 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
17 * Boston, MA 02111-1307, USA.
18 */
19
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23
24 #include <glib.h>
25 #include <curl/curl.h>
26 #include <expat.h>
27 #include <string.h>
28 #include <ctype.h>
29 #include <errno.h>
30
31 #include "feed.h"
32
33 #include "parser.h"
34
/*
 * Feed formats that the root-element chooser in this file can report.
 *
 * Note that the trailing "FeedTypes" declarator makes this a file-scope
 * object of the anonymous enum type, not a typedef; it is kept as-is so the
 * symbols of this translation unit do not change.
 */
enum {
	FEED_TYPE_NONE    = 0,	/* nothing detected (yet) */
	FEED_TYPE_RDF     = 1,	/* root element "rdf:RDF" */
	FEED_TYPE_RSS_20  = 2,	/* root element "rss" */
	FEED_TYPE_ATOM_03 = 3,	/* root element "feed" without the Atom 1.0 namespace */
	FEED_TYPE_ATOM_10 = 4,	/* root element "feed" with the Atom 1.0 namespace */
	FEED_TYPE_OPML    = 5	/* not referenced by the detection code in this file */
} FeedTypes;
43
/*
 * Install the start/end element handlers that belong to the given feed type.
 *
 * parser: Expat parser to reconfigure; a NULL parser is ignored.
 * type:   one of the FEED_TYPE_* values.
 *
 * Only RSS 2.0, RDF and Atom 1.0 have dedicated handlers here; for any other
 * value the currently installed handlers are left untouched.
 */
static void _handler_set(XML_Parser parser, guint type)
{
	if (parser == NULL)
		return;

	if (type == FEED_TYPE_RSS_20) {
		XML_SetElementHandler(parser,
				feed_parser_rss20_start,
				feed_parser_rss20_end);
	} else if (type == FEED_TYPE_RDF) {
		XML_SetElementHandler(parser,
				feed_parser_rdf_start,
				feed_parser_rdf_end);
	} else if (type == FEED_TYPE_ATOM_10) {
		XML_SetElementHandler(parser,
				feed_parser_atom10_start,
				feed_parser_atom10_end);
	}
}
69
_elparse_start_chooser(void * data,const gchar * el,const gchar ** attr)70 static void _elparse_start_chooser(void *data,
71 const gchar *el, const gchar **attr)
72 {
73 FeedParserCtx *ctx = (FeedParserCtx *)data;
74 guint feedtype = FEED_TYPE_NONE;
75 gchar *version;
76
77 if( ctx->depth == 0 ) {
78
79 /* RSS 2.0 detected */
80 if( !strcmp(el, "rss") ) {
81 feedtype = FEED_TYPE_RSS_20;
82 } else if( !strcmp(el, "rdf:RDF") ) {
83 feedtype = FEED_TYPE_RDF;
84 } else if( !strcmp(el, "feed") ) {
85
86 /* ATOM feed detected, let's check version */
87 version = feed_parser_get_attribute_value(attr, "xmlns");
88 if( version != NULL &&
89 (!strcmp(version, "http://www.w3.org/2005/Atom") ||
90 !strcmp(version, "https://www.w3.org/2005/Atom")) )
91 feedtype = FEED_TYPE_ATOM_10;
92 else
93 feedtype = FEED_TYPE_ATOM_03;
94 } else {
95 /* Not a known feed type */
96 ctx->feed->is_valid = FALSE;
97 }
98 }
99
100 _handler_set(ctx->parser, feedtype);
101
102 ctx->depth++;
103 }
104
_elparse_end_dummy(void * data,const gchar * el)105 static void _elparse_end_dummy(void *data, const gchar *el)
106 {
107 FeedParserCtx *ctx = (FeedParserCtx *)data;
108
109 if( ctx->str != NULL ) {
110 g_string_free(ctx->str, TRUE);
111 ctx->str = NULL;
112 }
113
114 ctx->depth--;
115 }
116
/*
 * Expat character-data handler: accumulates element text in ctx->str.
 *
 * data: the FeedParserCtx registered as Expat user data.
 * s:    character data; NOT NUL-terminated, only the first len bytes are valid.
 * len:  number of bytes available at s.
 *
 * A chunk made up purely of whitespace is dropped when no text has been
 * collected yet (ctx->str == NULL), so accumulated strings never start with
 * an all-blank chunk.  Once a GString exists, every chunk is appended as-is.
 *
 * The bytes are appended straight from s with g_string_append_len(), so no
 * temporary copy (and no malloc()/g_free() pairing) is needed.
 */
void libfeed_expat_chparse(void *data, const gchar *s, gint len)
{
	FeedParserCtx *ctx = (FeedParserCtx *)data;
	gint i;

	if (s == NULL || len <= 0)
		return;

	if (ctx->str == NULL) {
		/* Cast to unsigned char: passing a negative char value (any byte
		 * >= 0x80 where char is signed) to isspace() is undefined. */
		for (i = 0; i < len; i++) {
			if (!isspace((unsigned char)s[i]))
				break;
		}

		/* Whole chunk is blank and nothing was collected so far. */
		if (i == len)
			return;

		ctx->str = g_string_sized_new(len + 1);
	}

	g_string_append_len(ctx->str, s, len);
}
145
146
/*
 * Wire up the Expat callbacks used at the start of a parse.
 *
 * ctx: parser context; it is registered as the Expat user data, so every
 *      callback receives it as its first argument.
 *
 * The element handlers installed here are the root-element chooser and its
 * matching end handler; character data goes to libfeed_expat_chparse() and
 * unknown encodings to feed_parser_unknown_encoding_handler().
 */
void feed_parser_set_expat_handlers(FeedParserCtx *ctx)
{
	XML_Parser parser = ctx->parser;

	XML_SetUserData(parser, (void *)ctx);

	XML_SetCharacterDataHandler(parser, libfeed_expat_chparse);

	XML_SetElementHandler(parser,
			_elparse_start_chooser,
			_elparse_end_dummy);

	/* No extra handler data is needed, hence the NULL. */
	XML_SetUnknownEncodingHandler(parser,
			feed_parser_unknown_encoding_handler, NULL);
}
161
feed_writefunc(void * ptr,size_t size,size_t nmemb,void * data)162 size_t feed_writefunc(void *ptr, size_t size, size_t nmemb, void *data)
163 {
164 gint len = size * nmemb;
165 FeedParserCtx *ctx = (FeedParserCtx *)data;
166 gint status, err;
167
168 if (!ctx->feed->is_valid) {
169 /* We already know that the feed is not valid, so we won't
170 * try parsing it. Just return correct number so libcurl is
171 * happy. */
172 return len;
173 }
174
175 status = XML_Parse(ctx->parser, ptr, len, FALSE);
176
177 if( status == XML_STATUS_ERROR ) {
178 err = XML_GetErrorCode(ctx->parser);
179 printf("\nExpat: --- %s\n\n", XML_ErrorString(err));
180 ctx->feed->is_valid = FALSE;
181 }
182
183 return len;
184 }
185
feed_parser_get_attribute_value(const gchar ** attr,const gchar * name)186 gchar *feed_parser_get_attribute_value(const gchar **attr, const gchar *name)
187 {
188 guint i;
189
190 if( attr == NULL || name == NULL )
191 return NULL;
192
193 for( i = 0; attr[i] != NULL && attr[i+1] != NULL; i += 2 ) {
194 if( !strcmp( attr[i], name) )
195 return (gchar *)attr[i+1];
196 }
197
198 /* We haven't found anything. */
199 return NULL;
200 }
201
/* Number of bytes in one UTF-32 code unit. */
#define CHARSIZEUTF32 4

/* Result codes of giconv_utf32_char(). */
enum {
	LEP_ICONV_OK      = 0,	/* exactly one character was converted */
	LEP_ICONV_FAILED  = 1,	/* input/output not fully consumed, or no iconv support */
	LEP_ICONV_ILSEQ   = 2,	/* converter reported EILSEQ */
	LEP_ICONV_INVAL   = 3,	/* converter reported EINVAL */
	LEP_ICONV_UNKNOWN = 4	/* converter failed with any other errno */
};
211
giconv_utf32_char(GIConv cd,const gchar * inbuf,size_t insize,guint32 * p_value)212 static gint giconv_utf32_char(GIConv cd, const gchar *inbuf, size_t insize,
213 guint32 *p_value)
214 {
215 #ifdef HAVE_ICONV
216 size_t outsize;
217 guchar outbuf[CHARSIZEUTF32];
218 gchar *outbufp;
219 gint r;
220
221 outsize = sizeof(outbuf);
222 outbufp = (gchar *)outbuf;
223 #ifdef HAVE_ICONV_PROTO_CONST
224 r = g_iconv(cd, (const gchar **)&inbuf, &insize,
225 &outbufp, &outsize);
226 #else
227 r = g_iconv(cd, (gchar **)&inbuf, &insize,
228 &outbufp, &outsize);
229 #endif
230 if( r == -1 ) {
231 g_iconv(cd, 0, 0, 0, 0);
232 switch(errno) {
233 case EILSEQ:
234 return LEP_ICONV_ILSEQ;
235 case EINVAL:
236 return LEP_ICONV_INVAL;
237 default:
238 return LEP_ICONV_UNKNOWN;
239 }
240 } else {
241 guint32 value;
242 guint i;
243
244 if( (insize > 0) || (outsize > 0) )
245 return LEP_ICONV_FAILED;
246
247 value = 0;
248 for( i = 0; i < sizeof(outbuf); i++ ) {
249 value = (value << 8) + outbuf[i];
250 }
251 *p_value = value;
252 return LEP_ICONV_OK;
253 }
254 #else
255 return LEP_ICONV_FAILED;
256 #endif
257 }
258
feed_parser_setup_unknown_encoding(const gchar * charset,XML_Encoding * info)259 static gint feed_parser_setup_unknown_encoding(const gchar *charset,
260 XML_Encoding *info)
261 {
262 GIConv cd;
263 gint flag, r;
264 gchar buf[4];
265 guint i, j, k;
266 guint32 value;
267
268 cd = g_iconv_open("UTF-32BE", charset);
269 if( cd == (GIConv) -1 )
270 return -1;
271
272 flag = 0;
273 for( i = 0; i < 256; i++ ) {
274 /* first char */
275 buf[0] = i;
276 info->map[i] = 0;
277 r = giconv_utf32_char(cd, buf, 1, &value);
278 if( r == LEP_ICONV_OK) {
279 info->map[i] = value;
280 } else if( r != LEP_ICONV_INVAL ) {
281 } else {
282 for( j = 0; j < 256; j++ ) {
283 /* second char */
284 buf[1] = j;
285 r = giconv_utf32_char(cd, buf, 2, &value);
286 if( r == LEP_ICONV_OK ) {
287 flag = 1;
288 info->map[i] = -2;
289 } else if( r != LEP_ICONV_INVAL ) {
290 } else {
291 for( k = 0; k < 256; k++ ) {
292 /* third char */
293 buf[2] = k;
294 r = giconv_utf32_char(cd, buf, 3, &value);
295 if( r == LEP_ICONV_OK) {
296 info->map[i] = -3;
297 }
298 }
299 }
300 }
301 }
302 }
303
304 g_iconv_close(cd);
305
306 return flag;
307 }
308
/*
 * State attached to an XML_Encoding (as its "data" member) when the unknown
 * encoding needs a convert callback.  It is created in
 * feed_parser_unknown_encoding_handler() and destroyed in
 * feed_parser_unknown_encoding_data_free().
 */
struct FeedParserUnknownEncoding {
	gchar *charset;	/* strdup()'d copy of the encoding name; released with free() */
	GIConv cd;	/* converter from that encoding to "UTF-32BE" */
};
313
feed_parser_unknown_encoding_convert(void * data,const gchar * s)314 static gint feed_parser_unknown_encoding_convert(void *data, const gchar *s)
315 {
316 gint r;
317 struct FeedParserUnknownEncoding *enc_data;
318 size_t insize;
319 guint32 value;
320
321 enc_data = data;
322 insize = 4;
323
324 if( s == NULL )
325 return -1;
326
327 r = giconv_utf32_char(enc_data->cd, s, insize, &value);
328 if( r != LEP_ICONV_OK )
329 return -1;
330
331 return 0;
332 }
333
feed_parser_unknown_encoding_data_free(void * data)334 static void feed_parser_unknown_encoding_data_free(void *data)
335 {
336 struct FeedParserUnknownEncoding *enc_data;
337
338 enc_data = data;
339 free(enc_data->charset);
340 g_iconv_close(enc_data->cd);
341 free(enc_data);
342 }
343
/*
 * Expat unknown-encoding handler.
 *
 * encdata: handler data registered with Expat (unused).
 * name:    name of the encoding Expat could not handle.
 * info:    encoding description to fill in.
 *
 * The byte map is built by feed_parser_setup_unknown_encoding().  If that
 * reports a purely single-byte encoding (result 0), no callbacks are needed.
 * For any other result a converter to UTF-32BE is opened and handed to Expat
 * together with the convert/release callbacks.
 *
 * Returns XML_STATUS_OK on success, XML_STATUS_ERROR if the converter cannot
 * be opened or memory runs out.
 */
int feed_parser_unknown_encoding_handler(void *encdata, const XML_Char *name,
		XML_Encoding *info)
{
	struct FeedParserUnknownEncoding *enc;
	GIConv conv;

	if( feed_parser_setup_unknown_encoding(name, info) == 0 ) {
		/* Single-byte encoding: the map alone is enough. */
		info->data = NULL;
		info->convert = NULL;
		info->release = NULL;
		return XML_STATUS_OK;
	}

	conv = g_iconv_open("UTF-32BE", name);
	if( conv == (GIConv)-1 )
		return XML_STATUS_ERROR;

	enc = malloc( sizeof(*enc) );
	if( enc != NULL )
		enc->charset = strdup(name);

	if( enc == NULL || enc->charset == NULL ) {
		/* free(NULL) is a no-op, so this covers both failures. */
		free(enc);
		g_iconv_close(conv);
		return XML_STATUS_ERROR;
	}

	enc->cd = conv;

	info->data = enc;
	info->convert = feed_parser_unknown_encoding_convert;
	info->release = feed_parser_unknown_encoding_data_free;

	return XML_STATUS_OK;
}
383