1 /*
2   Copyright (C) 2005-2017 Marius L. Jøhndal
3 
4   This library is free software; you can redistribute it and/or
5   modify it under the terms of the GNU Lesser General Public
6   License as published by the Free Software Foundation; either
7   version 2.1 of the License, or (at your option) any later version.
8 
9   This library is distributed in the hope that it will be useful,
10   but WITHOUT ANY WARRANTY; without even the implied warranty of
11   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
12   Lesser General Public License for more details.
13 
14   You should have received a copy of the GNU Lesser General Public
15   License along with this library; if not, write to the Free Software
16   Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
17 
18 */
19 
20 #ifdef HAVE_CONFIG_H
21 #include "config.h"
22 #endif /* HAVE_CONFIG_H */
23 
24 #include <string.h>
25 #include <sys/types.h>
26 #include <sys/stat.h>
27 #include <glib.h>
28 #include <glib/gprintf.h>
29 #include "libxmlutil.h"
30 #include "urlget.h"
31 #include "channel.h"
32 #include "rss.h"
33 #include "utils.h"
34 #include "progress.h"
35 #include "filenames.h"
36 
37 static int _enclosure_pattern_match(enclosure_filter *filter,
38                                     const enclosure *enclosure);
39 
_enclosure_iterator(const void * user_data,int i,const xmlNode * node)40 static void _enclosure_iterator(const void *user_data, int i, const xmlNode *node)
41 {
42   const char *downloadtime;
43 
44   channel *c = (channel *)user_data;
45 
46   downloadtime = libxmlutil_attr_as_string(node, "downloadtime");
47 
48   if (downloadtime)
49     downloadtime = g_strdup(downloadtime);
50   else
51     downloadtime = get_rfc822_time();
52 
53   g_hash_table_insert(c->downloaded_enclosures,
54                       (gpointer)libxmlutil_attr_as_string(node, "url"),
55                       (gpointer)downloadtime);
56 }
57 
channel_new(const char * url,const char * channel_file,const char * spool_directory,const char * filename_pattern,int resume)58 channel *channel_new(const char *url, const char *channel_file,
59                      const char *spool_directory,
60                      const char *filename_pattern,
61                      int resume)
62 {
63   channel *c;
64   xmlDocPtr doc;
65   xmlNode *root_element = NULL;
66   const char *s;
67 
68   c = (channel *)malloc(sizeof(struct _channel));
69   c->url = g_strdup(url);
70   c->channel_filename = g_strdup(channel_file);
71   c->spool_directory = g_strdup(spool_directory);
72   c->filename_pattern = g_strdup(filename_pattern);
73   //  c->resume = resume;
74   c->rss_last_fetched = NULL;
75   c->downloaded_enclosures = g_hash_table_new_full(g_str_hash, g_str_equal, NULL, g_free);
76 
77   if (g_file_test(c->channel_filename, G_FILE_TEST_EXISTS)) {
78     doc = xmlReadFile(c->channel_filename, NULL, 0);
79 
80     if (!doc) {
81       g_fprintf(stderr, "Error parsing channel file %s.\n", c->channel_filename);
82       return NULL;
83     }
84 
85     root_element = xmlDocGetRootElement(doc);
86 
87     if (!root_element)  {
88       xmlFreeDoc(doc);
89 
90       g_fprintf(stderr, "Error parsing channel file %s.\n", c->channel_filename);
91       return NULL;
92     }
93 
94     /* Fetch channel attributes. */
95     s = libxmlutil_attr_as_string(root_element, "rsslastfetched");
96 
97     if (s)
98       c->rss_last_fetched = g_strdup(s);
99 
100     /* Iterate encolsure elements. */
101     libxmlutil_iterate_by_tag_name(root_element, "enclosure", c, _enclosure_iterator);
102 
103     xmlFreeDoc(doc);
104   }
105 
106   return c;
107 }
108 
/* GHFunc used with g_hash_table_foreach(): writes one entry of the table of
   downloaded enclosures as an <enclosure/> element.

   key       - enclosure URL (string).
   value     - download time (string), or NULL if none is recorded.
   user_data - the FILE * to write to.

   Both strings end up inside XML attribute values, so both are escaped. */
static void _cast_channel_save_downloaded_enclosure(gpointer key, gpointer value,
                                                    gpointer user_data)
{
  FILE *f = (FILE *)user_data;
  gchar *escaped_key = g_markup_escape_text((const gchar *)key, -1);

  if (value) {
    gchar *escaped_value = g_markup_escape_text((const gchar *)value, -1);

    g_fprintf(f, "  <enclosure url=\"%s\" downloadtime=\"%s\"/>\n",
              escaped_key, escaped_value);

    g_free(escaped_value);
  } else
    g_fprintf(f, "  <enclosure url=\"%s\"/>\n", escaped_key);

  g_free(escaped_key);
}
123 
/* Writer callback passed to write_by_temporary_file(): emits the complete
   channel file as XML.

   f         - stream to write to.
   user_data - the channel to serialise.
   debug     - not used here.

   Always returns 0. */
static int _cast_channel_save_channel(FILE *f, gpointer user_data, int debug)
{
  channel *c = (channel *)user_data;

  (void)debug;

  g_fprintf(f, "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n");

  if (c->rss_last_fetched) {
    /* The timestamp becomes an attribute value, so escape it just like the
       enclosure URLs are escaped. */
    gchar *escaped_time = g_markup_escape_text(c->rss_last_fetched, -1);

    g_fprintf(f, "<channel version=\"1.0\" rsslastfetched=\"%s\">\n", escaped_time);

    g_free(escaped_time);
  } else
    g_fprintf(f, "<channel version=\"1.0\">\n");

  /* One <enclosure/> element per entry in the table. */
  g_hash_table_foreach(c->downloaded_enclosures, _cast_channel_save_downloaded_enclosure, f);

  g_fprintf(f, "</channel>\n");

  return 0;
}
141 
/* Save the channel to its channel file. The work is delegated to
   write_by_temporary_file(), which receives _cast_channel_save_channel() as
   the writer callback and the channel itself as that callback's user data. */
static void _cast_channel_save(channel *c, int debug)
{
  write_by_temporary_file(c->channel_filename,
                          _cast_channel_save_channel,
                          c,
                          NULL,
                          debug);
}
146 
/* Release a channel created by channel_new(), including every string it
   owns and the table of downloaded enclosures. Passing NULL is a no-op. */
void channel_free(channel *c)
{
  if (!c)
    return;

  /* Frees the stored download times; the table has no key destructor. */
  g_hash_table_destroy(c->downloaded_enclosures);

  /* Set by channel_new() and channel_update() with g_strdup(). */
  g_free(c->rss_last_fetched);

  g_free(c->spool_directory);
  g_free(c->channel_filename);
  g_free(c->url);
  g_free(c->filename_pattern);

  /* The structure itself was obtained from malloc(). */
  free(c);
}
156 
/* Data callback handed to urlget_buffer(): appends the received chunk to the
   stream passed as user_data.

   buffer      - the chunk of data.
   size, nmemb - element size and element count of the chunk.
   user_data   - the FILE * to write to.

   Returns what fwrite() returns, i.e. the number of elements written. */
static size_t _enclosure_urlget_cb(void *buffer, size_t size, size_t nmemb, void *user_data)
{
  FILE *out = user_data;
  size_t written;

  written = fwrite(buffer, size, nmemb, out);

  return written;
}
163 
_get_rss(channel * c,void * user_data,channel_callback cb,int debug)164 static rss_file *_get_rss(channel *c, void *user_data, channel_callback cb, int debug)
165 {
166   rss_file *f;
167 
168   if (cb)
169     cb(user_data, CCA_RSS_DOWNLOAD_START, NULL, NULL, NULL);
170 
171   if (!strncmp("http://", c->url, strlen("http://"))
172       || !strncmp("https://", c->url, strlen("https://")))
173     f = rss_open_url(c->url, debug);
174   else
175     f = rss_open_file(c->url);
176 
177   if (cb)
178     cb(user_data, CCA_RSS_DOWNLOAD_END, &(f->channel_info), NULL, NULL);
179 
180   return f;
181 }
182 
/* Download the enclosure of an RSS item into the channel's spool directory.

   c                 - channel supplying the spool directory and the
                       filename pattern.
   channel_info      - feed information; used for building the filename and
                       handed to the callback.
   item              - item whose enclosure is downloaded.
   user_data         - passed through to cb.
   cb                - optional callback (may be NULL); receives
                       CCA_ENCLOSURE_DOWNLOAD_START and
                       CCA_ENCLOSURE_DOWNLOAD_END around the transfer.
   resume            - non-zero to append to a file that already exists.
   debug             - passed through to urlget_buffer().
   show_progress_bar - non-zero to create a progress bar for the transfer.

   Returns 0 on success and 1 on failure. A failure to flush and close the
   enclosure file counts as a failed download. */
static int _do_download(channel *c, channel_info *channel_info, rss_item *item,
                        void *user_data, channel_callback cb, int resume,
                        int debug, int show_progress_bar)
{
  int download_failed;
  /* An offset of 0 means that the file is written in non-append mode. */
  long resume_from = 0;
  gchar *enclosure_full_filename;
  FILE *enclosure_file;
  struct stat fileinfo;
  progress_bar *pb;

  /* Check that the spool directory exists. */
  if (!g_file_test(c->spool_directory, G_FILE_TEST_IS_DIR)) {
    g_fprintf(stderr, "Spool directory %s not found.\n", c->spool_directory);
    return 1;
  }

  /* Build enclosure filename. */
  enclosure_full_filename = build_enclosure_filename(c->spool_directory,
    c->filename_pattern, channel_info, item);

  if (g_file_test(enclosure_full_filename, G_FILE_TEST_EXISTS)) {
    /* A file with the same filename already exists. If the user has asked us
       to resume downloads, we should append to the file. Otherwise we should
       refuse to continue. If the feed uses the same filename for each
       enclosure, running in append mode will corrupt existing files. There is
       probably no practical way to avoid this, and the issue is documented in
       castget(1) and castgetrc(5). */
    if (!resume) {
      /* File exists but user does not allow us to append so we have to abort. */
      g_fprintf(stderr, "Enclosure file %s already exists.\n", enclosure_full_filename);
      g_free(enclosure_full_filename);
      return 1;
    }

    /* Set resume offset to the size of the file as it is now. The offset
       stays 0 (non-append mode) if the size is zero or stat() fails. */
    if (0 == stat(enclosure_full_filename, &fileinfo))
      resume_from = fileinfo.st_size;
  }

  enclosure_file = fopen(enclosure_full_filename, resume_from ? "ab" : "wb");

  if (!enclosure_file) {
    g_fprintf(stderr, "Error opening enclosure file %s.\n", enclosure_full_filename);
    g_free(enclosure_full_filename);
    return 1;
  }

  if (cb)
    cb(user_data, CCA_ENCLOSURE_DOWNLOAD_START, channel_info, item->enclosure, enclosure_full_filename);

  if (show_progress_bar)
    pb = progress_bar_new(resume_from);
  else
    pb = NULL;

  if (urlget_buffer(item->enclosure->url, enclosure_file, _enclosure_urlget_cb, resume_from, debug, pb)) {
    g_fprintf(stderr, "Error downloading enclosure from %s.\n", item->enclosure->url);

    download_failed = 1;
  } else
    download_failed = 0;

  if (pb)
    progress_bar_free(pb);

  /* fclose() flushes buffered data; if that fails the file on disk is
     incomplete, so the download must not be reported as successful. */
  if (fclose(enclosure_file) != 0) {
    g_fprintf(stderr, "Error writing enclosure file %s.\n", enclosure_full_filename);

    download_failed = 1;
  }

  if (cb)
    cb(user_data, CCA_ENCLOSURE_DOWNLOAD_END, channel_info, item->enclosure, enclosure_full_filename);

  g_free(enclosure_full_filename);

  return download_failed;
}
263 
/* Counterpart of _do_download() that transfers nothing: it only sends the
   CCA_ENCLOSURE_DOWNLOAD_START and CCA_ENCLOSURE_DOWNLOAD_END notifications
   for the item's enclosure, with a NULL filename.

   c is not used. cb may be NULL, in which case nothing happens at all.

   Always returns 0. */
static int _do_catchup(channel *c, channel_info *channel_info, rss_item *item,
                       void *user_data, channel_callback cb)
{
  if (!cb)
    return 0;

  cb(user_data, CCA_ENCLOSURE_DOWNLOAD_START, channel_info, item->enclosure, NULL);
  cb(user_data, CCA_ENCLOSURE_DOWNLOAD_END, channel_info, item->enclosure, NULL);

  return 0;
}
275 
/* Fetch the channel's feed and process every enclosure that is not yet
   recorded as downloaded.

   c                 - channel to update.
   user_data, cb     - optional progress callback and its user data.
   no_download       - non-zero to only send notifications for enclosures
                       instead of downloading them.
   no_mark_read      - non-zero to leave the channel file untouched.
   first_only        - non-zero to stop after the first processed enclosure.
   resume            - passed through to _do_download().
   filter            - optional filter (may be NULL); enclosures that do not
                       match it are skipped.
   debug             - passed through to the helpers.
   show_progress_bar - passed through to _do_download().

   Returns 1 if the feed could not be retrieved and 0 otherwise. Note that 0
   is also returned when processing stopped because an enclosure failed. */
int channel_update(channel *c, void *user_data, channel_callback cb,
                   int no_download, int no_mark_read, int first_only,
                   int resume, enclosure_filter *filter, int debug,
                   int show_progress_bar)
{
  rss_file *feed;
  int idx;

  /* Retrieve the RSS file. */
  feed = _get_rss(c, user_data, cb, debug);

  if (!feed)
    return 1;

  /* Check enclosures in RSS file. */
  for (idx = 0; idx < feed->num_items; idx++) {
    rss_item *item = feed->items[idx];
    int failed;

    /* Items without an enclosure are of no interest. */
    if (!item->enclosure)
      continue;

    /* Already recorded as downloaded. */
    if (g_hash_table_lookup_extended(c->downloaded_enclosures, item->enclosure->url, NULL, NULL))
      continue;

    /* Rejected by the filter, if there is one. */
    if (filter && !_enclosure_pattern_match(filter, item->enclosure))
      continue;

    if (no_download)
      failed = _do_catchup(c, &(feed->channel_info), item, user_data, cb);
    else
      failed = _do_download(c, &(feed->channel_info), item, user_data, cb, resume, debug, show_progress_bar);

    /* Give up on the remaining enclosures after the first failure. */
    if (failed)
      break;

    if (!no_mark_read) {
      /* Mark enclosure as downloaded and immediately save channel
         file to ensure that it reflects the change.

         NOTE(review): the key is the feed's own URL string, not a copy;
         confirm that nothing uses the table after rss_close() below. */
      g_hash_table_insert(c->downloaded_enclosures, item->enclosure->url,
                          (gpointer)get_rfc822_time());

      _cast_channel_save(c, debug);
    }

    /* If we have been instructed to deal only with the first
       available enclosure, it is time to break out of the loop. */
    if (first_only)
      break;
  }

  if (!no_mark_read) {
    /* Update the RSS last fetched time and save the channel file again.
       g_free() accepts NULL, so no check is needed. */
    g_free(c->rss_last_fetched);
    c->rss_last_fetched = g_strdup(feed->fetched_time);

    _cast_channel_save(c, debug);
  }

  rss_close(feed);

  return 0;
}
339 
340 /* Match the (file) name of an enclosure against a regexp. Letters
341    in the pattern match both upper and lower case letters if
342    'caseless' is TRUE. Returns TRUE if the pattern matches, FALSE
343    otherwise. */
_enclosure_pattern_match(enclosure_filter * filter,const enclosure * enclosure)344 static gboolean _enclosure_pattern_match(enclosure_filter *filter,
345                                          const enclosure *enclosure)
346 {
347   GError *error = NULL;
348   GRegexCompileFlags compile_options = 0;
349   GRegexMatchFlags match_options = 0;
350   GRegex *regex;
351   gboolean match;
352 
353   g_assert(filter);
354   g_assert(filter->pattern);
355   g_assert(enclosure);
356 
357   if (filter->caseless)
358     compile_options |= G_REGEX_CASELESS;
359 
360   regex = g_regex_new(filter->pattern, compile_options, match_options,
361                       &error);
362 
363   if (error) {
364     fprintf(stderr, "Error compiling regular expression %s: %s\n",
365             filter->pattern, error->message);
366     g_error_free(error);
367     return FALSE;
368   }
369 
370   match = g_regex_match(regex, enclosure->url, match_options, NULL);
371 
372   g_regex_unref(regex);
373 
374   return match;
375 }
376 
enclosure_filter_new(const gchar * pattern,gboolean caseless)377 enclosure_filter *enclosure_filter_new(const gchar *pattern,
378                                        gboolean caseless)
379 {
380   enclosure_filter *e = g_malloc(sizeof(struct _enclosure_filter));
381 
382   g_assert(pattern);
383 
384   e->pattern = g_strdup(pattern);
385   e->caseless = caseless;
386 
387   return e;
388 }
389 
/* Release a filter created by enclosure_filter_new(), including its copy of
   the pattern. Passing NULL is a no-op, as with g_free(). */
void enclosure_filter_free(enclosure_filter *e)
{
  if (!e)
    return;

  g_free(e->pattern);
  g_free(e);
}
395