1 /*
2 rss.h - RSS (and Atom) parser and generator (using libxml2)
3
4 Copyright (c) 2012-2018 Nikola Kolev <koue@chaosophia.net>
5 Copyright (c) 2006 NoisyB
6
7
8 This program is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 This program is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; if not, write to the Free Software
20 Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
21 */
22
23 #include <stdio.h>
24 #include <string.h>
25 #include <libxml/parser.h>
26 #include <libxml/tree.h>
27 #include <time.h>
28 #include <unistd.h>
29
30 #include "rss.h"
31
/*
 * strptime2 - convert a feed date string into a time_t.
 *
 * Recognised layouts:
 *	YYYY-MM-DDTHH:MM...		(e.g. 2006-07-19T10:30:00+00:00)
 *	Www, DD Mon YYYY HH:MM...	(e.g. Mon, 31 Jul 2006 15:05:00 GMT)
 *	YYYY-MM-DD			(e.g. 2006-07-19)
 *
 * Only year, month, day, hour and minute are extracted; seconds and any
 * zone designator are ignored.  The fields are handed to mktime(), so
 * they are interpreted as local time.
 *
 * The function takes ownership of s and frees it before returning.
 * A NULL pointer or a string in an unrecognised layout leaves every
 * field at zero, and whatever mktime() yields for that is returned.
 */
time_t
strptime2(char *s)
{
	static const char *const month_s[] = {
		"Jan", "Feb", "Mar", "Apr", "May", "Jun",
		"Jul", "Aug", "Sep", "Oct", "Nov", "Dec", NULL
	};
	char y[100], m[100], d[100];
	char h[100], min[100];
	struct tm time_tag;
	size_t len;
	int i;

	*y = *m = *d = *h = *min = 0;

	/* The length guards keep the probes below inside the string. */
	len = (s != NULL) ? strlen(s) : 0;

	if (len > 10 && s[10] == 'T') {
		/* YYYY-MM-DDT00:00+00:00 */
		sscanf(s, " %4s-%2s-%2sT%2s:%2s", y, m, d, h, min);
	} else if (len > 4 && s[3] == ',' && s[4] == ' ') {
		/* Mon, 31 Jul 2006 15:05:00 GMT */
		/* %99s bounds the month token to the size of m[]. */
		sscanf(s + 5, "%2s %99s %4s %2s:%2s", d, m, y, h, min);
		for (i = 0; month_s[i] != NULL; i++) {
			if (strcasecmp(m, month_s[i]) == 0) {
				/* Replace the month name by its number. */
				snprintf(m, sizeof(m), "%d", i + 1);
				break;
			}
		}
	} else if (len > 7 && s[4] == '-' && s[7] == '-') {
		/* 2006-07-19 */
		sscanf(s, "%4s-%2s-%2s", y, m, d);
	}
	/* Any other layout (e.g. YYYYMMDDTHHMMSS) is not parsed. */

	free(s);

	memset(&time_tag, 0, sizeof(time_tag));

	if (*y)
		time_tag.tm_year = strtol(y, NULL, 10) - 1900;
	if (*m)
		time_tag.tm_mon = strtol(m, NULL, 10) - 1;
	if (*d)
		time_tag.tm_mday = strtol(d, NULL, 10);
	if (*h)
		time_tag.tm_hour = strtol(h, NULL, 10);
	if (*min)
		time_tag.tm_min = strtol(min, NULL, 10);

	return (mktime(&time_tag));
}
78
79
80 unsigned char *
xml_get_value(xmlNode * n,const char * name)81 xml_get_value(xmlNode *n, const char *name)
82 {
83 if (n)
84 if (xmlHasProp(n, (const unsigned char *)name))
85 return (xmlGetProp(n, (const unsigned char *)name));
86 return (NULL);
87 }
88
89 void
rss_st_rss_t_sanity_check(st_rss_t * rss)90 rss_st_rss_t_sanity_check(st_rss_t *rss)
91 {
92 for (int i = 0; i < rss->item_count; i++)
93 printf("pos: %d\n"
94 "title: %s\n"
95 "url: %s\n"
96 "date: %ld\n"
97 "desc: %s\n\n",
98 i,
99 rss->item[i].title,
100 rss->item[i].url,
101 (long)rss->item[i].date,
102 rss->item[i].desc);
103 printf("rss->item_count: %d\n\n", rss->item_count);
104 }
105
106 static void
rss_read_copy(char * d,xmlDoc * doc,xmlNode * n)107 rss_read_copy(char *d, xmlDoc *doc, xmlNode *n)
108 {
109 dmsg(1, "%s: start", __func__);
110 char *p = (char *)xmlNodeGetContent(n);
111 if (p)
112 strncpy(d, p, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
113 else
114 *d = 0;
115 dmsg(1, "%s: p: %s", __func__, p);
116 if (p)
117 free(p);
118 dmsg(1, "%s: end", __func__);
119 }
120
121 int
rss_close(st_rss_t * rss)122 rss_close(st_rss_t *rss)
123 {
124 dmsg(1, __func__);
125 if (rss) {
126 free(rss);
127 rss = NULL;
128 }
129
130 return (0);
131 }
132
133 static void
rss_channel(st_rss_t * rss,xmlDoc * doc,xmlNode * pnode)134 rss_channel(st_rss_t *rss, xmlDoc *doc, xmlNode *pnode)
135 {
136 dmsg(1, "%s: start", __func__);
137 while (pnode) {
138 dmsg(1, "%s: pnode->name: %s", __func__, (char *) pnode->name);
139 if (strcmp((char *)pnode->name, "title") == 0)
140 rss_read_copy(rss->title, doc, pnode->xmlChildrenNode);
141 else if (strcmp((char *)pnode->name, "description") == 0)
142 rss_read_copy (rss->desc, doc, pnode->xmlChildrenNode);
143 else if (strcmp((char *)pnode->name, "date") == 0||
144 strcmp((char *)pnode->name, "pubDate") == 0 ||
145 strcmp((char *) pnode->name, "dc:date") == 0)
146 rss->date = strptime2((char *)xmlNodeListGetString(pnode->xmlChildrenNode->doc, pnode->xmlChildrenNode, 1));
147
148 pnode = pnode->next;
149 }
150 dmsg(1, "%s: end", __func__);
151 }
152
153 static void
rss_entry(st_rss_item_t * item,xmlDoc * doc,xmlNode * pnode)154 rss_entry(st_rss_item_t *item, xmlDoc *doc, xmlNode *pnode)
155 {
156 char link[RSSMAXBUFSIZE], guid[RSSMAXBUFSIZE];
157 char *p = NULL, *href = NULL;
158
159 *link = *guid = 0;
160
161 dmsg(1, "%s: start", __func__);
162 while (pnode) {
163 while (pnode && xmlIsBlankNode(pnode))
164 pnode = pnode->next;
165
166 if (pnode == NULL)
167 break;
168
169 dmsg(1, "%s\n", (char *)pnode->name);
170 if (strcmp((char *)pnode->name, "title") == 0) {
171 rss_read_copy(item->title, doc, pnode->xmlChildrenNode);
172 } else if (strcmp((char *)pnode->name, "link") == 0) {
173 p = (char *)xml_get_value(pnode, "rel"); // atom
174 if (p) {
175 if (strcmp(p, "alternate") == 0) {
176 href = (char *) xml_get_value(pnode, "href");
177 strncpy(link, href, RSSMAXBUFSIZE)[RSSMAXBUFSIZE-1] = 0;
178 free(href);
179 }
180 free(p);
181 } else {
182 rss_read_copy(link, doc, pnode->xmlChildrenNode); //rss
183 }
184 } else if (strcmp((char *)pnode->name, "guid") == 0 && (!(*link))) {
185 rss_read_copy(guid, doc, pnode->xmlChildrenNode);
186 } else if (!strcmp((char *)pnode->name, "description")) {
187 rss_read_copy(item->desc, doc, pnode->xmlChildrenNode);
188 } else if (!strcmp((char *)pnode->name, "content")) {
189 rss_read_copy(item->desc, doc, pnode->xmlChildrenNode);
190 } else if (!strcasecmp((char *)pnode->name, "date") ||
191 !strcasecmp((char *)pnode->name, "pubDate") ||
192 !strcasecmp((char *)pnode->name, "dc:date") ||
193 !strcmp((char *)pnode->name, "modified") ||
194 !strcmp((char *)pnode->name, "updated") ||
195 !strcasecmp((char *)pnode->name, "cropDate")) {
196 item->date = strptime2((char *)xmlNodeListGetString(pnode->xmlChildrenNode->doc, pnode->xmlChildrenNode, 1));
197 }
198
199 pnode = pnode->next;
200 }
201
202 // some feeds use the guid tag for the link
203 if (*link)
204 strlcpy(item->url, link, sizeof(item->url));
205 else if (*guid)
206 strlcpy(item->url, guid, sizeof(item->url));
207 else
208 *(item->url) = 0;
209 dmsg(1, "%s: end", __func__);
210 }
211
212 static void
rss_head(st_rss_t * rss,xmlDoc * doc,xmlNode * node)213 rss_head(st_rss_t *rss, xmlDoc *doc, xmlNode *node)
214 {
215 dmsg(1, "%s: start", __func__);
216 while (node) {
217 while (node && xmlIsBlankNode(node))
218 node = node->next;
219
220 if (node == NULL)
221 break;
222
223 dmsg(1, "%s: node->name: %s", __func__, (char *)node->name);
224 if (!strcmp((char *)node->name, "title"))
225 rss_read_copy(rss->title, doc, node->xmlChildrenNode);
226 else if (!strcmp((char *)node->name, "description"))
227 rss_read_copy(rss->desc, doc, node->xmlChildrenNode);
228 else if (!strcmp((char *)node->name, "date") ||
229 !strcmp((char *)node->name, "pubDate") ||
230 !strcmp((char *)node->name, "modified") ||
231 !strcmp((char *)node->name, "updated") ||
232 !strcmp((char *)node->name, "dc:date"))
233 rss->date = strptime2((char *)xmlNodeListGetString(node->xmlChildrenNode->doc, node->xmlChildrenNode, 1));
234 else if (!strcmp((char *)node->name, "channel") && (rss->version == RSS_V1_0)) {
235 rss_channel(rss, doc, node->xmlChildrenNode);
236 } else if (!strcmp((char *)node->name, "item") ||
237 !strcmp((char *)node->name, "entry")) {
238 rss_entry(&rss->item[rss->item_count], doc, node->xmlChildrenNode);
239 rss->item_count++;
240 if (rss->item_count == RSSMAXITEM)
241 break;
242 }
243 node = node->next;
244 }
245 dmsg(1, "%s: end", __func__);
246 }
247
248 static st_rss_t *
rss_parse(st_rss_t * rss)249 rss_parse(st_rss_t *rss)
250 {
251 xmlDoc *doc;
252 xmlNode *node;
253
254 dmsg(1, "%s: start", __func__);
255 if ((doc = xmlParseFile(rss->url)) == NULL) {
256 fprintf(stderr, "%s: cannot read %s\n", __func__, rss->url);
257 return (NULL);
258 } else if ((node = xmlDocGetRootElement(doc)) == NULL) {
259 fprintf (stderr, "%s: empty document %s\n", __func__, rss->url);
260 xmlFreeDoc(doc);
261 return (NULL);
262 }
263
264 dmsg(1, "%s: rss->url %s", __func__, rss->url);
265 dmsg(1, "%s: node->name: %s", __func__, (char *)node->name);
266
267 node = node->xmlChildrenNode;
268 while (node && xmlIsBlankNode(node))
269 node = node->next;
270
271 if (node == NULL) {
272 fprintf(stderr, "%s: bad document %s\n", __func__, rss->url);
273 xmlFreeDoc(doc);
274 return (NULL);
275 } else if (rss->version < ATOM_V0_1) {
276 if (strcmp((char *)node->name, "channel")) {
277 fprintf (stderr, "%s: bad document: channel missing %s\n",
278 __func__, rss->url);
279 return (NULL);
280 } else if (rss->version != RSS_V1_0) // document is RSS
281 node = node->xmlChildrenNode;
282 }
283
284 rss_head(rss, doc, node);
285 if (debug > 1) {
286 rss_st_rss_t_sanity_check(rss);
287 fflush(stdout);
288 }
289
290 xmlFreeDoc(doc);
291 dmsg(1, "%s: end", __func__);
292 return (rss);
293 }
294
295 int
rss_demux(const char * fname)296 rss_demux(const char *fname)
297 {
298 xmlDoc *doc = NULL;
299 xmlNode *node = NULL;
300 int version = -1;
301 char *p = NULL;
302
303 dmsg(1, "%s: start %s", __func__, fname);
304 if ((doc = xmlParseFile(fname)) == NULL) {
305 fprintf(stderr, "%s: cannot read %s\n", __func__, fname);
306 goto done;
307 }
308
309 if ((node = xmlDocGetRootElement(doc)) == NULL)
310 goto done;
311 else if ((char *)node->name == NULL)
312 goto done;
313 else if (strcmp((char *)node->name, "html") == 0) // not xml
314 goto done;
315 else if (strcmp((char *)node->name, "feed") == 0) {
316 version = ATOM_V0_1; //default
317 if ((p = (char *)xml_get_value(node, "version")) == NULL)
318 goto done;
319 else if (strcmp(p, "0.3") == 0)
320 version = ATOM_V0_3;
321 else if (strcmp(p, "0.2") == 0)
322 version = ATOM_V0_2;
323 } else if (strcmp((char *)node->name, "rss") == 0) {
324 if ((p = (char *)xml_get_value(node, "version")) == NULL)
325 goto done;
326 else if (strcmp(p, "0.91") == 0)
327 version = RSS_V0_91;
328 else if (strcmp(p, "0.92") == 0)
329 version = RSS_V0_92;
330 else if (strcmp(p, "0.93") == 0)
331 version = RSS_V0_93;
332 else if (strcmp(p, "0.94") == 0)
333 version = RSS_V0_94;
334 else if ((strcmp(p, "2") == 0) || (strcmp(p, "2.0") == 0) ||
335 (strcmp(p, "2.00") == 0))
336 version = RSS_V2_0;
337 } else if ((strcmp((char *)node->name, "rdf") == 0) ||
338 (strcmp((char *)node->name, "RDF") == 0)) {
339 version = RSS_V1_0;
340 }
341 done:
342 if (p != NULL)
343 free(p);
344 xmlFreeDoc(doc);
345 dmsg(1, "%s: end", __func__);
346 return (version);
347 }
348
349 st_rss_t *
rss_open(const char * fname)350 rss_open(const char *fname)
351 {
352 st_rss_t *rss = NULL;
353
354 dmsg(1, "%s: start", __func__);
355 dmsg(1, "%s: %s", __func__, fname);
356 if ((rss = malloc(sizeof(st_rss_t))) == NULL)
357 return (NULL);
358
359 memset(rss, 0, sizeof(st_rss_t));
360 strncpy(rss->url, fname, RSSMAXBUFSIZE)[RSSMAXBUFSIZE - 1] = 0;
361 rss->item_count = 0;
362
363 rss->version = rss_demux(fname);
364
365 if (rss->version == -1) {
366 fprintf(stderr, "ERROR: uknown feed format %s.\n", rss->url);
367 return (NULL);
368 }
369 dmsg(1, "%s: end", __func__);
370 return rss_parse(rss);
371 }
372
373 /* debug message out */
374 void
dmsg(int verbose,const char * fmt,...)375 dmsg(int verbose, const char *fmt, ...)
376 {
377 if (debug > verbose) {
378 va_list ap;
379 time_t t = time(NULL);
380 struct tm *tm = gmtime(&t);
381 fprintf(stdout, "%4.4d.%2.2d.%2.2d %2.2d:%2.2d:%2.2d ",
382 tm->tm_year + 1900, tm->tm_mon + 1, tm->tm_mday, tm->tm_hour,
383 tm->tm_min, tm->tm_sec);
384 va_start(ap, fmt);
385 vfprintf(stdout, fmt, ap);
386 va_end(ap);
387 fprintf(stdout, "\n");
388 fflush(stdout);
389 }
390 }
391