1 /*
2 rss.h - RSS (and Atom) parser and generator (using libxml2)
3 
4 Copyright (c) 2012-2018 Nikola Kolev <koue@chaosophia.net>
5 Copyright (c) 2006 NoisyB
6 
7 
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12 
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
16 GNU General Public License for more details.
17 
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22 
23 #include <stdio.h>
24 #include <string.h>
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <time.h>
28 #include <unistd.h>
29 
30 #include "rss.h"
31 
/*
 * strptime2 - convert a feed date string into a time_t.
 *
 * Recognized layouts:
 *   YYYY-MM-DDThh:mm...           (e.g. "2006-07-19T12:34+00:00")
 *   Day, DD Mon YYYY hh:mm...     (e.g. "Mon, 31 Jul 2006 15:05:00 GMT")
 *   YYYY-MM-DD                    (e.g. "2006-07-19")
 * Seconds and any time-zone suffix are not parsed.  Fields that are not
 * found stay zero.  The broken-down time is converted with mktime().
 *
 * s may be NULL or shorter than the layouts above; such input is treated as
 * unparseable instead of being indexed out of bounds.  A non-NULL s is
 * released with free() before returning, so callers must hand over a heap
 * string and must not use it afterwards.
 *
 * Returns the value produced by mktime() for the collected fields.
 */
time_t
strptime2(char *s)
{
	static const char *const month_s[] = {
		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec", NULL
	};
	char y[100], m[100], d[100];
	char h[100], min[100];
	struct tm time_tag;
	size_t len;
	int i;

	*y = *m = *d = *h = *min = 0;

	/* Measure first so the layout probes never read past the terminator. */
	len = (s != NULL) ? strlen(s) : 0;

	if (len > 10 && s[10] == 'T') {		// YYYY-MM-DDT00:00+00:00
		sscanf(s, " %4s-%2s-%2sT%2s:%2s", y, m, d, h, min);
	} else if (len > 4 && s[3] == ',' && s[4] == ' ') {
		// Mon, 31 Jul 2006 15:05:00 GMT
		// %99s keeps an overlong month token inside m[100].
		sscanf(s + 5, "%2s %99s %4s %2s:%2s", d, m, y, h, min);
		for (i = 0; month_s[i] != NULL; i++) {
			if (strcasecmp(m, month_s[i]) == 0) {
				/* replace the month name by its 1-based number */
				snprintf(m, sizeof(m), "%d", i + 1);
				break;
			}
		}
	} else if (len > 7 && s[4] == '-' && s[7] == '-') {	// 2006-07-19
		sscanf(s, "%4s-%2s-%2s", y, m, d);
	}
	/* any other layout (e.g. YYYYMMDDTHHMMSS) is not parsed */

	free(s);	/* free(NULL) is a no-op */

	memset(&time_tag, 0, sizeof(time_tag));

	if (*y)
		time_tag.tm_year = (int)strtol(y, NULL, 10) - 1900;
	if (*m)
		time_tag.tm_mon = (int)strtol(m, NULL, 10) - 1;
	if (*d)
		time_tag.tm_mday = (int)strtol(d, NULL, 10);
	if (*h)
		time_tag.tm_hour = (int)strtol(h, NULL, 10);
	if (*min)
		time_tag.tm_min = (int)strtol(min, NULL, 10);

	return (mktime(&time_tag));
}
78 
79 
80 unsigned char *
xml_get_value(xmlNode * n,const char * name)81 xml_get_value(xmlNode *n, const char *name)
82 {
83 	if (n)
84 		if (xmlHasProp(n, (const unsigned char *)name))
85 			return (xmlGetProp(n, (const unsigned char *)name));
86 	return (NULL);
87 }
88 
89 void
rss_st_rss_t_sanity_check(st_rss_t * rss)90 rss_st_rss_t_sanity_check(st_rss_t *rss)
91 {
92 	for (int i = 0; i < rss->item_count; i++)
93 		printf("pos: %d\n"
94 			"title: %s\n"
95 			"url: %s\n"
96 			"date: %ld\n"
97 			"desc: %s\n\n",
98 		i,
99 		rss->item[i].title,
100 		rss->item[i].url,
101 		(long)rss->item[i].date,
102 		rss->item[i].desc);
103 	printf("rss->item_count: %d\n\n", rss->item_count);
104 }
105 
106 static void
rss_read_copy(char * d,xmlDoc * doc,xmlNode * n)107 rss_read_copy(char *d, xmlDoc *doc, xmlNode *n)
108 {
109 	dmsg(1, "%s: start", __func__);
110 	char *p = (char *)xmlNodeGetContent(n);
111 	if (p)
112 		strncpy(d, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
113 	else
114 		*d = 0;
115 	dmsg(1, "%s: p: %s", __func__, p);
116 	if (p)
117 		free(p);
118 	dmsg(1, "%s: end", __func__);
119 }
120 
121 int
rss_close(st_rss_t * rss)122 rss_close(st_rss_t *rss)
123 {
124 	dmsg(1, __func__);
125 	if (rss) {
126 		free(rss);
127 		rss = NULL;
128 	}
129 
130 	return (0);
131 }
132 
133 static void
rss_channel(st_rss_t * rss,xmlDoc * doc,xmlNode * pnode)134 rss_channel(st_rss_t *rss, xmlDoc *doc, xmlNode *pnode)
135 {
136 	dmsg(1, "%s: start", __func__);
137 	while (pnode) {
138 		dmsg(1, "%s: pnode->name: %s", __func__, (char *) pnode->name);
139 		if (strcmp((char *)pnode->name, "title") == 0)
140 			rss_read_copy(rss->title, doc, pnode->xmlChildrenNode);
141 		else if (strcmp((char *)pnode->name, "description") == 0)
142 			rss_read_copy (rss->desc, doc, pnode->xmlChildrenNode);
143 		else if (strcmp((char *)pnode->name, "date") == 0||
144 		    strcmp((char *)pnode->name, "pubDate") == 0 ||
145 		    strcmp((char *) pnode->name, "dc:date") == 0)
146 			rss->date = strptime2((char *)xmlNodeListGetString(pnode->xmlChildrenNode->doc, pnode->xmlChildrenNode, 1));
147 
148 		pnode = pnode->next;
149 	}
150 	dmsg(1, "%s: end", __func__);
151 }
152 
153 static void
rss_entry(st_rss_item_t * item,xmlDoc * doc,xmlNode * pnode)154 rss_entry(st_rss_item_t *item, xmlDoc *doc, xmlNode *pnode)
155 {
156 	char link[RSSMAXBUFSIZE], guid[RSSMAXBUFSIZE];
157 	char *p = NULL, *href = NULL;
158 
159 	*link = *guid = 0;
160 
161 	dmsg(1, "%s: start", __func__);
162 	while (pnode) {
163 		while (pnode && xmlIsBlankNode(pnode))
164 			pnode = pnode->next;
165 
166 		if (pnode == NULL)
167 			break;
168 
169 		dmsg(1, "%s\n", (char *)pnode->name);
170 		if (strcmp((char *)pnode->name, "title") == 0) {
171 			rss_read_copy(item->title, doc, pnode->xmlChildrenNode);
172 		} else if (strcmp((char *)pnode->name, "link") == 0) {
173 			p = (char *)xml_get_value(pnode, "rel");	// atom
174 			if (p) {
175 				if (strcmp(p, "alternate") == 0) {
176 					href = (char *) xml_get_value(pnode, "href");
177 					strncpy(link, href, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
178 					free(href);
179 				}
180 				free(p);
181 			} else {
182 				rss_read_copy(link, doc, pnode->xmlChildrenNode); //rss
183 			}
184 		} else if (strcmp((char *)pnode->name, "guid") == 0 && (!(*link))) {
185 			rss_read_copy(guid, doc, pnode->xmlChildrenNode);
186 		} else if (!strcmp((char *)pnode->name, "description")) {
187 			rss_read_copy(item->desc, doc, pnode->xmlChildrenNode);
188 		} else if (!strcmp((char *)pnode->name, "content")) {
189 			rss_read_copy(item->desc, doc, pnode->xmlChildrenNode);
190 		} else if (!strcasecmp((char *)pnode->name, "date") ||
191 		    !strcasecmp((char *)pnode->name, "pubDate") ||
192 		    !strcasecmp((char *)pnode->name, "dc:date") ||
193 		    !strcmp((char *)pnode->name, "modified") ||
194 		    !strcmp((char *)pnode->name, "updated") ||
195 		    !strcasecmp((char *)pnode->name, "cropDate")) {
196 			item->date = strptime2((char *)xmlNodeListGetString(pnode->xmlChildrenNode->doc, pnode->xmlChildrenNode, 1));
197 		}
198 
199 		pnode = pnode->next;
200 	}
201 
202 	// some feeds use the guid tag for the link
203 	if (*link)
204 		strlcpy(item->url, link, sizeof(item->url));
205 	else if (*guid)
206 		strlcpy(item->url, guid, sizeof(item->url));
207 	else
208 		*(item->url) = 0;
209 	dmsg(1, "%s: end", __func__);
210 }
211 
212 static void
rss_head(st_rss_t * rss,xmlDoc * doc,xmlNode * node)213 rss_head(st_rss_t *rss, xmlDoc *doc, xmlNode *node)
214 {
215 	dmsg(1, "%s: start", __func__);
216 	while (node) {
217 		while (node && xmlIsBlankNode(node))
218 			node = node->next;
219 
220 		if (node == NULL)
221 			break;
222 
223 		dmsg(1, "%s: node->name: %s", __func__, (char *)node->name);
224 		if (!strcmp((char *)node->name, "title"))
225 			rss_read_copy(rss->title, doc, node->xmlChildrenNode);
226 		else if (!strcmp((char *)node->name, "description"))
227 			rss_read_copy(rss->desc, doc, node->xmlChildrenNode);
228 		else if (!strcmp((char *)node->name, "date") ||
229 		    !strcmp((char *)node->name, "pubDate") ||
230 		    !strcmp((char *)node->name, "modified") ||
231 		    !strcmp((char *)node->name, "updated") ||
232 		    !strcmp((char *)node->name, "dc:date"))
233 			rss->date = strptime2((char *)xmlNodeListGetString(node->xmlChildrenNode->doc, node->xmlChildrenNode, 1));
234 		else if (!strcmp((char *)node->name, "channel") && (rss->version == RSS_V1_0)) {
235 			rss_channel(rss, doc, node->xmlChildrenNode);
236 		} else if (!strcmp((char *)node->name, "item") ||
237 		    !strcmp((char *)node->name, "entry")) {
238 			rss_entry(&rss->item[rss->item_count], doc, node->xmlChildrenNode);
239 			rss->item_count++;
240 			if (rss->item_count == RSSMAXITEM)
241 				break;
242 		}
243 	node = node->next;
244 	}
245 	dmsg(1, "%s: end", __func__);
246 }
247 
248 static st_rss_t *
rss_parse(st_rss_t * rss)249 rss_parse(st_rss_t *rss)
250 {
251 	xmlDoc *doc;
252 	xmlNode *node;
253 
254 	dmsg(1, "%s: start", __func__);
255 	if ((doc = xmlParseFile(rss->url)) == NULL) {
256 		fprintf(stderr, "%s: cannot read %s\n", __func__, rss->url);
257 		return (NULL);
258 	} else if ((node = xmlDocGetRootElement(doc)) == NULL) {
259 		fprintf (stderr, "%s: empty document %s\n", __func__, rss->url);
260 		xmlFreeDoc(doc);
261 		return (NULL);
262 	}
263 
264 	dmsg(1, "%s: rss->url %s", __func__, rss->url);
265 	dmsg(1, "%s: node->name: %s", __func__, (char *)node->name);
266 
267 	node = node->xmlChildrenNode;
268 	while (node && xmlIsBlankNode(node))
269 		node = node->next;
270 
271 	if (node == NULL) {
272 		fprintf(stderr, "%s: bad document %s\n", __func__, rss->url);
273 		xmlFreeDoc(doc);
274 		return (NULL);
275 	} else if (rss->version < ATOM_V0_1) {
276 		if (strcmp((char *)node->name, "channel")) {
277 			fprintf (stderr, "%s: bad document: channel missing %s\n",
278 							__func__, rss->url);
279 			return (NULL);
280 		} else if (rss->version != RSS_V1_0) // document is RSS
281 			node = node->xmlChildrenNode;
282 	}
283 
284 	rss_head(rss, doc, node);
285 	if (debug > 1) {
286 		rss_st_rss_t_sanity_check(rss);
287 		fflush(stdout);
288 	}
289 
290 	xmlFreeDoc(doc);
291 	dmsg(1, "%s: end", __func__);
292 	return (rss);
293 }
294 
295 int
rss_demux(const char * fname)296 rss_demux(const char *fname)
297 {
298 	xmlDoc *doc = NULL;
299 	xmlNode *node = NULL;
300 	int version = -1;
301 	char *p = NULL;
302 
303 	dmsg(1, "%s: start %s", __func__, fname);
304 	if ((doc = xmlParseFile(fname)) == NULL) {
305 		fprintf(stderr, "%s: cannot read %s\n", __func__, fname);
306 		goto done;
307 	}
308 
309 	if ((node = xmlDocGetRootElement(doc)) == NULL)
310 		goto done;
311 	else if ((char *)node->name == NULL)
312 		goto done;
313 	else if (strcmp((char *)node->name, "html") == 0) // not xml
314 		goto done;
315 	else if (strcmp((char *)node->name, "feed") == 0) {
316 		version = ATOM_V0_1;	//default
317 		if ((p = (char *)xml_get_value(node, "version")) == NULL)
318 			goto done;
319 		else if (strcmp(p, "0.3") == 0)
320 			version = ATOM_V0_3;
321 		else if (strcmp(p, "0.2") == 0)
322 			version = ATOM_V0_2;
323 	} else if (strcmp((char *)node->name, "rss") == 0) {
324 		if ((p = (char *)xml_get_value(node, "version")) == NULL)
325 			goto done;
326 		else if (strcmp(p, "0.91") == 0)
327 			version = RSS_V0_91;
328 		else if (strcmp(p, "0.92") == 0)
329 			version = RSS_V0_92;
330 		else if (strcmp(p, "0.93") == 0)
331 			version = RSS_V0_93;
332 		else if (strcmp(p, "0.94") == 0)
333 			version = RSS_V0_94;
334 		else if ((strcmp(p, "2") == 0) || (strcmp(p, "2.0") == 0) ||
335 			    (strcmp(p, "2.00") == 0))
336 			version = RSS_V2_0;
337 	} else if ((strcmp((char *)node->name, "rdf") == 0) ||
338 		    (strcmp((char *)node->name, "RDF") == 0)) {
339 		version = RSS_V1_0;
340 	}
341 done:
342 	if (p != NULL)
343 		free(p);
344 	xmlFreeDoc(doc);
345 	dmsg(1, "%s: end", __func__);
346 	return (version);
347 }
348 
349 st_rss_t *
rss_open(const char * fname)350 rss_open(const char *fname)
351 {
352 	st_rss_t *rss = NULL;
353 
354 	dmsg(1, "%s: start", __func__);
355 	dmsg(1, "%s: %s", __func__, fname);
356 	if ((rss = malloc(sizeof(st_rss_t))) == NULL)
357 		return (NULL);
358 
359 	memset(rss, 0, sizeof(st_rss_t));
360 	strncpy(rss->url, fname, RSSMAXBUFSIZE)[RSSMAXBUFSIZE - 1] = 0;
361 	rss->item_count = 0;
362 
363 	rss->version = rss_demux(fname);
364 
365 	if (rss->version == -1) {
366 		fprintf(stderr, "ERROR: uknown feed format %s.\n", rss->url);
367 		return (NULL);
368 	}
369 	dmsg(1, "%s: end", __func__);
370 	return rss_parse(rss);
371 }
372 
373 /* debug message out */
374 void
dmsg(int verbose,const char * fmt,...)375 dmsg(int verbose, const char *fmt, ...)
376 {
377 	if (debug > verbose) {
378 		va_list ap;
379 		time_t t = time(NULL);
380 		struct tm *tm = gmtime(&t);
381 		fprintf(stdout, "%4.4d.%2.2d.%2.2d %2.2d:%2.2d:%2.2d ",
382 		    tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
383 		    tm->tm_min, tm->tm_sec);
384 		va_start(ap, fmt);
385 		vfprintf(stdout, fmt, ap);
386 		va_end(ap);
387 		fprintf(stdout, "\n");
388 		fflush(stdout);
389 	}
390 }
391