1 /*
2  *  This file is part of rmlint.
3  *
4  *  rmlint is free software: you can redistribute it and/or modify
5  *  it under the terms of the GNU General Public License as published by
6  *  the Free Software Foundation, either version 3 of the License, or
7  *  (at your option) any later version.
8  *
9  *  rmlint is distributed in the hope that it will be useful,
10  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
11  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
12  *  GNU General Public License for more details.
13  *
14  *  You should have received a copy of the GNU General Public License
15  *  along with rmlint.  If not, see <http://www.gnu.org/licenses/>.
16  *
17  * Authors:
18  *
19  *  - Christopher <sahib> Pahl 2010-2020 (https://github.com/sahib)
20  *  - Daniel <SeeSpotRun> T.   2014-2020 (https://github.com/SeeSpotRun)
21  *
22  * Hosted on http://github.com/sahib/rmlint
23  *
24  */
25 
26 #include "../checksums/murmur3.h"
27 #include "../formats.h"
28 #include "../preprocess.h"
29 #include "../utilities.h"
30 #include "../treemerge.h"
31 
32 #include <glib.h>
33 #include <stdio.h>
34 #include <string.h>
35 
/* State of the "json" output handler.
 *
 * The callbacks in this file receive a plain RmFmtHandler pointer and cast it
 * to this type, which is why the parent has to be the very first member.
 */
typedef struct RmFmtHandlerJSON {
    /* must be first (callbacks cast RmFmtHandler * to RmFmtHandlerJSON *) */
    RmFmtHandler parent;

    /* More human readable output?
     * When true, objects are spread over several indented lines;
     * rm_fmt_head() clears it if the "oneline" option is configured. */
    bool pretty;

    /* set of already existing ids
     * (created in rm_fmt_head(), filled by rm_fmt_json_generate_id(),
     * released in rm_fmt_foot()) */
    GHashTable *id_set;
} RmFmtHandlerJSON;
46 
47 //////////////////////////////////////////
48 //          FILE ID GENERATOR           //
49 //////////////////////////////////////////
50 
rm_fmt_json_generate_id(RmFmtHandlerJSON * self,RmFile * file,const char * file_path,char * cksum)51 static guint32 rm_fmt_json_generate_id(RmFmtHandlerJSON *self, RmFile *file,
52                                        const char *file_path, char *cksum) {
53     guint32 hash = 0;
54     hash = file->inode ^ file->dev;
55     hash ^= file->actual_file_size;
56 
57     for(int i = 0; i < 8192; ++i) {
58         hash ^= MurmurHash3_x86_32(file_path, strlen(file_path), i);
59         if(cksum != NULL) {
60             hash ^= MurmurHash3_x86_32(cksum, strlen(cksum), i);
61         }
62 
63         if(!g_hash_table_contains(self->id_set, GUINT_TO_POINTER(hash))) {
64             break;
65         }
66     }
67 
68     g_hash_table_add(self->id_set, GUINT_TO_POINTER(hash));
69     return hash;
70 }
71 
72 //////////////////////////////////////////
73 //  POOR MAN'S JSON FORMATTING TOOLBOX  //
74 //////////////////////////////////////////
75 
/* Write `"key": "value"` to `out`.
 *
 * Neither string is escaped, so both must already be safe to embed in a
 * JSON string literal.
 */
static void rm_fmt_json_key(FILE *out, const char *key, const char *value) {
    fprintf(out, "\"%s\": ", key);
    fprintf(out, "\"%s\"", value);
}
79 
/* Write `"key": true` or `"key": false` to `out` (the key is not escaped). */
static void rm_fmt_json_key_bool(FILE *out, const char *key, bool value) {
    const char *literal = value ? "true" : "false";

    fprintf(out, "\"%s\": ", key);
    fputs(literal, out);
}
83 
/* Write `"key": <number>` to `out`; the value is printed with the LLU
 * format macro (the key is not escaped). */
static void rm_fmt_json_key_int(FILE *out, const char *key, RmOff value) {
    fprintf(out, "\"%s\": ", key);
    fprintf(out, "%" LLU, value);
}
87 
/* Write `"key": <floating point number>` to `out` (the key is not escaped). */
static void rm_fmt_json_key_float(FILE *out, const char *key, gdouble value) {
    /* Format through g_ascii_dtostr() so the decimal separator is always
     * a '.', no matter which locale is active (e.g. german uses a comma). */
    gchar number[G_ASCII_DTOSTR_BUF_SIZE];
    g_ascii_dtostr(number, sizeof(number) - 1, value);

    fprintf(out, "\"%s\": %s", key, number);
}
94 
/* Escape `string` so that it can be placed between the quotes of a JSON
 * string literal, writing the NUL-terminated result to `fixed`.
 *
 *  - '"' and '\' are prefixed with a backslash.
 *  - \b, \f, \n, \r and \t use their short escapes.
 *  - All other control bytes (0x01..0x1f) and DEL (0x7f) become \u00XX.
 *  - Every other byte (including bytes >= 0x80) is copied unmodified.
 *
 * At most `fixed_len` bytes (terminator included) are written to `fixed`.
 *
 * Returns true if the complete escaped string fit into `fixed`. Returns false
 * if it had to be cut off (the buffer then holds the part that did fit,
 * always cut at an escape boundary) or if an argument was unusable.
 *
 * More information here:
 *
 * http://stackoverflow.com/questions/4901133/json-and-escaping-characters/4908960#4908960
 */
static bool rm_fmt_json_fix(const char *string, char *fixed, size_t fixed_len) {
    static const char hex_digits[] = "0123456789abcdef";

    if(fixed == NULL || fixed_len == 0) {
        return false;
    }

    fixed[0] = '\0';
    if(string == NULL) {
        return false;
    }

    /* Invariant: used < fixed_len, so fixed[used] is always writable. */
    size_t used = 0;

    for(const unsigned char *curr = (const unsigned char *)string; *curr != 0; ++curr) {
        char text[6];
        size_t text_len = 0;

        switch(*curr) {
        case '"':
        case '\\':
            /* Printable, but needs to be escaped */
            text[0] = '\\';
            text[1] = (char)*curr;
            text_len = 2;
            break;
        case '\b':
            text[0] = '\\';
            text[1] = 'b';
            text_len = 2;
            break;
        case '\f':
            text[0] = '\\';
            text[1] = 'f';
            text_len = 2;
            break;
        case '\n':
            text[0] = '\\';
            text[1] = 'n';
            text_len = 2;
            break;
        case '\r':
            text[0] = '\\';
            text[1] = 'r';
            text_len = 2;
            break;
        case '\t':
            text[0] = '\\';
            text[1] = 't';
            text_len = 2;
            break;
        default:
            if(*curr <= 0x1f || *curr == 0x7f) {
                /* Something unprintable; JSON demands escaping up to and
                 * including 0x1f. */
                text[0] = '\\';
                text[1] = 'u';
                text[2] = '0';
                text[3] = '0';
                text[4] = hex_digits[*curr >> 4];
                text[5] = hex_digits[*curr & 0x0f];
                text_len = 6;
            } else {
                /* Take it unmodified */
                text[0] = (char)*curr;
                text_len = 1;
            }
            break;
        }

        /* The piece plus the terminating NUL must fit into what is left. */
        if(text_len >= fixed_len - used) {
            fixed[used] = '\0';
            return false;
        }

        memcpy(fixed + used, text, text_len);
        used += text_len;
    }

    fixed[used] = '\0';
    return true;
}
146 
/* Write `"key": "value"` to `out`, with `value` escaped by rm_fmt_json_fix()
 * so that arbitrary file names yield valid JSON. The key is not escaped.
 *
 * If the value cannot be escaped (NULL, or rm_fmt_json_fix() reports failure)
 * the placeholder "<BROKEN PATH>" is written instead.
 */
static void rm_fmt_json_key_unsafe(FILE *out, const char *key, const char *value) {
    if(value == NULL) {
        fprintf(out, "\"%s\": \"<BROKEN PATH>\"", key);
        return;
    }

    /* In the worst case every input byte turns into a six byte \u00XX
     * sequence, so size the buffer for that (plus the terminating NUL)
     * instead of guessing a fixed amount of headroom. Zero-filled, so the
     * result is terminated no matter what. */
    const size_t safe_len = strlen(value) * 6 + 1;
    char *safe_value = g_malloc0(safe_len);

    if(rm_fmt_json_fix(value, safe_value, safe_len)) {
        fprintf(out, "\"%s\": \"%s\"", key, safe_value);
    } else {
        /* This should never happen but give at least means of debugging */
        fprintf(out, "\"%s\": \"<BROKEN PATH>\"", key);
    }

    g_free(safe_value);
}
158 
/* Start a JSON object; in pretty mode the first key goes onto a new,
 * indented line. */
static void rm_fmt_json_open(RmFmtHandlerJSON *self, FILE *out) {
    fputs(self->pretty ? "{\n  " : "{", out);
}
162 
/* Finish a JSON object and emit the comma separating it from the next
 * array element. Only the placement of the line break differs between
 * pretty and one-line mode. */
static void rm_fmt_json_close(RmFmtHandlerJSON *self, FILE *out) {
    fputs(self->pretty ? "\n}, " : "},\n", out);
}
170 
/* Separate two keys of an object; in pretty mode the next key starts on
 * its own indented line. */
static void rm_fmt_json_sep(RmFmtHandlerJSON *self, FILE *out) {
    fputs(self->pretty ? ",\n  " : ",", out);
}
174 
175 /////////////////////////
176 //  ACTUAL CALLBACKS   //
177 /////////////////////////
178 
/* Write `"key": "value"` with `value` JSON-escaped byte by byte directly to
 * the stream (a NULL value is written as an empty string).
 *
 * Streaming keeps this independent of any buffer size: the joined command
 * line written by the header is not bounded by PATH_MAX. The escaping rules
 * are: '"' and '\' get a backslash, \b \f \n \r \t use their short forms,
 * remaining control bytes (<= 0x1f) and DEL become \u00XX, everything else is
 * copied as is.
 */
static void rm_fmt_json_head_key_escaped(FILE *out, const char *key, const char *value) {
    fprintf(out, "\"%s\": \"", key);

    for(const unsigned char *c = (const unsigned char *)(value ? value : ""); *c != 0; ++c) {
        switch(*c) {
        case '"':
            fputs("\\\"", out);
            break;
        case '\\':
            fputs("\\\\", out);
            break;
        case '\b':
            fputs("\\b", out);
            break;
        case '\f':
            fputs("\\f", out);
            break;
        case '\n':
            fputs("\\n", out);
            break;
        case '\r':
            fputs("\\r", out);
            break;
        case '\t':
            fputs("\\t", out);
            break;
        default:
            if(*c <= 0x1f || *c == 0x7f) {
                fprintf(out, "\\u%04x", (unsigned)*c);
            } else {
                fputc(*c, out);
            }
            break;
        }
    }

    fputc('"', out);
}

/* "head" callback of the json handler: opens the top-level array, sets up the
 * per-run state (id set, pretty flag) and, unless "no_header" is configured,
 * writes the header object describing this run.
 */
static void rm_fmt_head(RmSession *session, _UNUSED RmFmtHandler *parent, FILE *out) {
    fprintf(out, "[\n");

    RmFmtHandlerJSON *self = (RmFmtHandlerJSON *)parent;
    self->id_set = g_hash_table_new(NULL, NULL);

    if(rm_fmt_get_config_value(session->formats, "json", "oneline")) {
        self->pretty = false;
    }

    if(!rm_fmt_get_config_value(session->formats, "json", "no_header")) {
        rm_fmt_json_open(self, out);
        {
            rm_fmt_json_key(out, "description", "rmlint json-dump of lint files");
            rm_fmt_json_sep(self, out);
            /* The working directory and the command line are user controlled
             * and may contain quotes, backslashes or control characters;
             * writing them verbatim would produce invalid JSON. */
            rm_fmt_json_head_key_escaped(out, "cwd", session->cfg->iwd);
            rm_fmt_json_sep(self, out);
            rm_fmt_json_head_key_escaped(out, "args", session->cfg->joined_argv);
            rm_fmt_json_sep(self, out);
            rm_fmt_json_key(out, "version", RM_VERSION);
            rm_fmt_json_sep(self, out);
            rm_fmt_json_key(out, "rev", RM_VERSION_GIT_REVISION);
            rm_fmt_json_sep(self, out);
            rm_fmt_json_key_int(out, "progress", 0); /* Header is always first. */
            rm_fmt_json_sep(self, out);
            rm_fmt_json_key(out, "checksum_type",
                            rm_digest_type_to_string(session->cfg->checksum_type));
            if(session->hash_seed) {
                /* Only mentioned when a non-zero seed is in use. */
                rm_fmt_json_sep(self, out);
                rm_fmt_json_key_int(out, "hash_seed", session->hash_seed);
            }

            rm_fmt_json_sep(self, out);
            rm_fmt_json_key_bool(out, "merge_directories", session->cfg->merge_directories);
        }
        rm_fmt_json_close(self, out);
    }
}
217 
/* "foot" callback of the json handler: writes the closing summary object
 * (or an empty object if "no_footer" is configured), terminates the top-level
 * array and drops the id set created by the head callback.
 */
static void rm_fmt_foot(_UNUSED RmSession *session, RmFmtHandler *parent, FILE *out) {
    RmFmtHandlerJSON *self = (RmFmtHandlerJSON *)parent;

    if(rm_fmt_get_config_value(session->formats, "json", "no_footer")) {
        /* Objects written before end with a comma, so something has to
         * follow them to keep the array well-formed. */
        fputs("{}", out);
    } else {
        rm_fmt_json_open(self, out);
        rm_fmt_json_key_bool(out, "aborted", rm_session_was_aborted());

        /* Numeric summary fields, in the order they are written. */
        const struct {
            const char *key;
            RmOff value;
        } summary[] = {
            {"progress", 100}, /* Footer is always last. */
            {"total_files", session->total_files},
            {"ignored_files", session->ignored_files},
            {"ignored_folders", session->ignored_folders},
            {"duplicates", session->dup_counter},
            {"duplicate_sets", session->dup_group_counter},
            {"total_lint_size", session->total_lint_size},
        };

        for(size_t i = 0; i < sizeof(summary) / sizeof(summary[0]); ++i) {
            rm_fmt_json_sep(self, out);
            rm_fmt_json_key_int(out, summary[i].key, summary[i].value);
        }

        /* Last object of the array: closed without a trailing comma. */
        fputs(self->pretty ? "\n}" : "}\n", out);
    }

    fputs("]\n", out);
    g_hash_table_unref(self->id_set);
}
252 
/* Fill `checksum_str` (a buffer of `size` bytes, size >= 1) with the hex
 * representation of file->digest as produced by rm_digest_hexstring().
 *
 * The buffer is first set to a NUL-terminated run of '0' characters, then
 * handed to rm_digest_hexstring().
 */
static void rm_fmt_json_cksum(RmFile *file, char *checksum_str, size_t size) {
    const size_t last = size - 1;

    memset(checksum_str, '0', size);
    checksum_str[last] = '\0';

    rm_digest_hexstring(file->digest, checksum_str);
}
258 
/* "elem" callback of the json handler: writes one JSON object describing
 * `file` to `out`, unless the "no_body" option is configured or the file is a
 * unique file that is not supposed to be listed.
 *
 * For unique files this also updates file->is_original according to the
 * keep_all_tagged / keep_all_untagged settings.
 */
static void rm_fmt_elem(RmSession *session, _UNUSED RmFmtHandler *parent, FILE *out, RmFile *file) {
    RmFmtHandlerJSON *self = (RmFmtHandlerJSON *)parent;

    if(rm_fmt_get_config_value(session->formats, "json", "no_body")) {
        return;
    }

    if(file->lint_type == RM_LINT_TYPE_UNIQUE_FILE) {
        /* Without the "unique" option, unique files are only written if they
         * carry a digest and unfinished checksums were requested. */
        if(!rm_fmt_get_config_value(session->formats, "json", "unique") &&
           !(file->digest && session->cfg->write_unfinished)) {
            return;
        }

        /* don't list 'untagged' files as unique when all tagged ones are kept,
         * and don't list 'tagged' files as unique when all untagged are kept */
        bool hidden_by_tag = (session->cfg->keep_all_tagged && !file->is_prefd) ||
                             (session->cfg->keep_all_untagged && file->is_prefd);
        file->is_original = !hidden_by_tag;
    }

    /* Hex form of the digest; stays NULL for files without one. */
    char *cksum_hex = NULL;
    size_t cksum_len = 0;

    if(file->digest != NULL) {
        cksum_len = rm_digest_get_bytes(file->digest) * 2 + 1;
        cksum_hex = g_slice_alloc0(cksum_len);
        rm_fmt_json_cksum(file, cksum_hex, cksum_len);
        cksum_hex[cksum_len - 1] = 0;
    }

    /* Make it look like a json element */
    rm_fmt_json_open(self, out);
    {
        RM_DEFINE_PATH(file);

        guint32 id = rm_fmt_json_generate_id(self, file, file_path, cksum_hex);
        rm_fmt_json_key_int(out, "id", id);
        rm_fmt_json_sep(self, out);

        rm_fmt_json_key(out, "type", rm_file_lint_type_to_string(file->lint_type));
        rm_fmt_json_sep(self, out);

        /* Percentage of the bytes counted after preprocessing that are no
         * longer remaining; stays 0 if that count is zero. */
        gdouble progress = 0;
        if(session->shred_bytes_after_preprocess) {
            gdouble remaining_ratio = (gdouble)session->shred_bytes_remaining /
                                      (gdouble)session->shred_bytes_after_preprocess;
            gdouble percent = 100 - 100 * remaining_ratio;
            progress = CLAMP(percent, 0, 100);
        }
        rm_fmt_json_key_int(out, "progress", progress);
        rm_fmt_json_sep(self, out);

        if(file->digest) {
            rm_fmt_json_key(out, "checksum", cksum_hex);
            rm_fmt_json_sep(self, out);
        }

        /* The path is the only free-form text here, hence the escaping. */
        rm_fmt_json_key_unsafe(out, "path", file_path);
        rm_fmt_json_sep(self, out);

        rm_fmt_json_key_int(out, "size", file->actual_file_size);
        rm_fmt_json_sep(self, out);

        rm_fmt_json_key_int(out, "depth", file->depth);
        rm_fmt_json_sep(self, out);

        rm_fmt_json_key_int(out, "inode", file->inode);
        rm_fmt_json_sep(self, out);

        rm_fmt_json_key_int(out, "disk_id", file->dev);
        rm_fmt_json_sep(self, out);

        rm_fmt_json_key_bool(out, "is_original", file->is_original);
        rm_fmt_json_sep(self, out);

        if(file->lint_type == RM_LINT_TYPE_DUPE_DIR_CANDIDATE) {
            rm_fmt_json_key_int(out, "n_children", file->n_children);
            rm_fmt_json_sep(self, out);
        }

        /* The remaining optional keys only apply to non-unique files. */
        if(file->lint_type != RM_LINT_TYPE_UNIQUE_FILE) {
            if(file->twin_count >= 0) {
                rm_fmt_json_key_int(out, "twins", file->twin_count);
                rm_fmt_json_sep(self, out);
            }

            if(file->lint_type == RM_LINT_TYPE_PART_OF_DIRECTORY && file->parent_dir) {
                rm_fmt_json_key_unsafe(out, "parent_path",
                                       rm_directory_get_dirname(file->parent_dir));
                rm_fmt_json_sep(self, out);
            }

            if(session->cfg->find_hardlinked_dupes) {
                RmFile *hardlink_head = RM_FILE_HARDLINK_HEAD(file);

                if(hardlink_head && hardlink_head != file && file->digest) {
                    char head_cksum[rm_digest_get_bytes(file->digest) * 2 + 1];
                    rm_fmt_json_cksum(hardlink_head, head_cksum, sizeof(head_cksum));

                    RM_DEFINE_PATH(hardlink_head);

                    /* NOTE(review): the id generator hands out a value that is
                     * not yet in its set, so if the head was already written
                     * this may differ from the head's own "id" - confirm that
                     * this is intended. */
                    guint32 head_id = rm_fmt_json_generate_id(
                        self, hardlink_head, hardlink_head_path, head_cksum);

                    rm_fmt_json_key_int(out, "hardlink_of", head_id);
                    rm_fmt_json_sep(self, out);
                }
            }
        }

        /* Always the last key, so it is not followed by a separator. */
        rm_fmt_json_key_float(out, "mtime", file->mtime);
    }
    rm_fmt_json_close(self, out);

    if(cksum_hex != NULL) {
        g_slice_free1(cksum_len, cksum_hex);
    }
}
382 
/* The one statically allocated instance of the json handler: it wires the
 * callbacks defined in this file into the generic RmFmtHandler interface and
 * lists the option keys this handler looks up via rm_fmt_get_config_value().
 */
static RmFmtHandlerJSON JSON_HANDLER_IMPL = {
    /* Initialize parent */
    .parent =
        {
            /* size of the whole derived struct, not just of the parent */
            .size = sizeof(JSON_HANDLER_IMPL),
            .name = "json",
            .head = rm_fmt_head,
            .elem = rm_fmt_elem,
            /* no progress callback for this handler */
            .prog = NULL,
            .foot = rm_fmt_foot,
            .valid_keys = {"no_header", "no_footer", "no_body", "oneline", "unique", NULL},
        },
    /* Multi-line output by default; rm_fmt_head() turns it off for "oneline" */
    .pretty = true};

/* Handle exported to the rest of the program (points at the parent member,
 * which is the first member of the implementation struct). */
RmFmtHandler *JSON_HANDLER = (RmFmtHandler *)&JSON_HANDLER_IMPL;
398