1 /* ncdu - NCurses Disk Usage
2 
3   Copyright (c) 2007-2021 Yoran Heling
4 
5   Permission is hereby granted, free of charge, to any person obtaining
6   a copy of this software and associated documentation files (the
7   "Software"), to deal in the Software without restriction, including
8   without limitation the rights to use, copy, modify, merge, publish,
9   distribute, sublicense, and/or sell copies of the Software, and to
10   permit persons to whom the Software is furnished to do so, subject to
11   the following conditions:
12 
13   The above copyright notice and this permission notice shall be included
14   in all copies or substantial portions of the Software.
15 
16   THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
17   EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
18   MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
19   IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY
20   CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT,
21   TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE
22   SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
23 
24 */
25 
26 /* This JSON parser has the following limitations:
27  * - No support for character encodings incompatible with ASCII (e.g.
28  *   UTF-16/32)
29  * - Doesn't validate UTF-8 correctness (in fact, besides the ASCII part this
30  *   parser doesn't know anything about encoding).
31  * - Doesn't validate that there are no duplicate keys in JSON objects.
32  * - Isn't very strict with validating non-integer numbers.
33  */
34 
35 #include "global.h"
36 
37 #include <stdlib.h>
38 #include <stdio.h>
39 #include <string.h>
40 #include <errno.h>
41 #include <limits.h>
42 
43 
44 /* Max. length of any JSON string we're interested in. A string may of course
45  * be larger, we're not going to read more than MAX_VAL in memory. If a string
46  * we're interested in (e.g. a file name) is longer than this, reading the
47  * import will results in an error. */
48 #define MAX_VAL (32*1024)
49 
50 /* Minimum number of bytes we request from fread() */
51 #define MIN_READ_SIZE 1024
52 
53 /* Read buffer size. Must be at least 2*MIN_READ_SIZE, everything larger
54  * improves performance. */
55 #define READ_BUF_SIZE (32*1024)
56 
57 
58 int dir_import_active = 0;
59 
60 
61 /* Use a struct for easy batch-allocation and deallocation of state data. */
62 static struct ctx {
63   FILE *stream;
64 
65   int line;
66   int byte;
67   int eof;
68   int items;
69   char *buf; /* points into readbuf, always zero-terminated. */
70   char *lastfill; /* points into readbuf, location of the zero terminator. */
71 
72   /* scratch space */
73   struct dir    *buf_dir;
74   struct dir_ext buf_ext[1];
75   unsigned int nlink;
76 
77   char buf_name[MAX_VAL];
78   char val[MAX_VAL];
79   char readbuf[READ_BUF_SIZE];
80 } *ctx;
81 
82 
83 /* Fills readbuf with data from the stream. *buf will have at least n (<
84  * READ_BUF_SIZE) bytes available, unless the stream reached EOF or an error
85  * occurred. If the file data contains a null-type, this is considered an error.
86  * Returns 0 on success, non-zero on error. */
fill(int n)87 static int fill(int n) {
88   int r;
89 
90   if(ctx->eof)
91     return 0;
92 
93   r = READ_BUF_SIZE-(ctx->lastfill - ctx->readbuf); /* number of bytes left in the buffer */
94   if(n < r)
95     n = r-1;
96   if(n < MIN_READ_SIZE) {
97     r = ctx->lastfill - ctx->buf; /* number of unread bytes left in the buffer */
98     memcpy(ctx->readbuf, ctx->buf, r);
99     ctx->lastfill = ctx->readbuf + r;
100     ctx->buf = ctx->readbuf;
101     n = READ_BUF_SIZE-r-1;
102   }
103 
104   do {
105     r = fread(ctx->lastfill, 1, n, ctx->stream);
106     if(r != n) {
107       if(feof(ctx->stream))
108         ctx->eof = 1;
109       else if(ferror(ctx->stream) && errno != EINTR) {
110         dir_seterr("Read error: %s", strerror(errno));
111         return 1;
112       }
113     }
114 
115     ctx->lastfill[r] = 0;
116     if(strlen(ctx->lastfill) != (size_t)r) {
117       dir_seterr("Zero-byte found in JSON stream");
118       return 1;
119     }
120     ctx->lastfill += r;
121     n -= r;
122   } while(!ctx->eof && n > MIN_READ_SIZE);
123 
124   return 0;
125 }
126 
127 
128 /* Two macros that break function calling behaviour, but are damn convenient */
129 #define E(_x, _m) do {\
130     if(_x) {\
131       if(!dir_fatalerr)\
132         dir_seterr("Line %d byte %d: %s", ctx->line, ctx->byte, _m);\
133       return 1;\
134     }\
135   } while(0)
136 
137 #define C(_x) do {\
138     if(_x)\
139       return 1;\
140   } while(0)
141 
142 
143 /* Require at least n bytes in the buffer, throw an error on early EOF.
144  * (Macro to quickly handle the common case) */
145 #define rfill1 (!*ctx->buf && _rfill(1))
146 #define rfill(_n) ((ctx->lastfill - ctx->buf < (_n)) && _rfill(_n))
147 
_rfill(int n)148 static int _rfill(int n) {
149   C(fill(n));
150   E(ctx->lastfill - ctx->buf < n, "Unexpected EOF");
151   return 0;
152 }
153 
154 
155 /* Consumes n bytes from the buffer. */
con(int n)156 static inline void con(int n) {
157   ctx->buf += n;
158   ctx->byte += n;
159 }
160 
161 
162 /* Consumes any whitespace. If *ctx->buf == 0 after this function, we've reached EOF. */
cons(void)163 static int cons(void) {
164   while(1) {
165     C(!*ctx->buf && fill(1));
166 
167     switch(*ctx->buf) {
168     case 0x0A:
169       /* Special-case the newline-character with respect to consuming stuff
170        * from the buffer. This is the only function which *can* consume the
171        * newline character, so it's more efficient to handle it in here rather
172        * than in the more general con(). */
173       ctx->buf++;
174       ctx->line++;
175       ctx->byte = 0;
176       break;
177     case 0x20:
178     case 0x09:
179     case 0x0D:
180       con(1);
181       break;
182     default:
183       return 0;
184     }
185   }
186 }
187 
188 
rstring_esc(char ** dest,int * destlen)189 static int rstring_esc(char **dest, int *destlen) {
190   unsigned int n;
191 
192   C(rfill1);
193 
194 #define ap(c) if(*destlen > 1) { *((*dest)++) = c; (*destlen)--; }
195   switch(*ctx->buf) {
196   case '"':  ap('"');  con(1); break;
197   case '\\': ap('\\'); con(1); break;
198   case '/':  ap('/');  con(1); break;
199   case 'b':  ap(0x08); con(1); break;
200   case 'f':  ap(0x0C); con(1); break;
201   case 'n':  ap(0x0A); con(1); break;
202   case 'r':  ap(0x0D); con(1); break;
203   case 't':  ap(0x09); con(1); break;
204   case 'u':
205     C(rfill(5));
206 #define hn(n) (n >= '0' && n <= '9' ? n-'0' : n >= 'A' && n <= 'F' ? n-'A'+10 : n >= 'a' && n <= 'f' ? n-'a'+10 : 1<<16)
207     n = (hn(ctx->buf[1])<<12) + (hn(ctx->buf[2])<<8) + (hn(ctx->buf[3])<<4) + hn(ctx->buf[4]);
208 #undef hn
209     if(n <= 0x007F) {
210       ap(n);
211     } else if(n <= 0x07FF) {
212       ap(0xC0 | (n>>6));
213       ap(0x80 | (n & 0x3F));
214     } else if(n <= 0xFFFF) {
215       ap(0xE0 | (n>>12));
216       ap(0x80 | ((n>>6) & 0x3F));
217       ap(0x80 | (n & 0x3F));
218     } else /* this happens if there was an invalid character (n >= (1<<16)) */
219       E(1, "Invalid character in \\u escape");
220     con(5);
221     break;
222   default:
223     E(1, "Invalid escape sequence");
224   }
225 #undef ap
226   return 0;
227 }
228 
229 
230 /* Parse a JSON string and write it to *dest (max. destlen). Consumes but
231  * otherwise ignores any characters if the string is longer than destlen. *dest
232  * will be null-terminated, dest[destlen-1] = 0 if the string was cut just long
233  * enough of was cut off. That byte will be left untouched if the string is
234  * small enough. */
rstring(char * dest,int destlen)235 static int rstring(char *dest, int destlen) {
236   C(rfill1);
237   E(*ctx->buf != '"', "Expected string");
238   con(1);
239 
240   while(1) {
241     C(rfill1);
242     if(*ctx->buf == '"')
243       break;
244     if(*ctx->buf == '\\') {
245       con(1);
246       C(rstring_esc(&dest, &destlen));
247       continue;
248     }
249     E((unsigned char)*ctx->buf <= 0x1F || (unsigned char)*ctx->buf == 0x7F, "Invalid character");
250     if(destlen > 1) {
251       *(dest++) = *ctx->buf;
252       destlen--;
253     }
254     con(1);
255   }
256   con(1);
257   if(destlen > 0)
258     *dest = 0;
259   return 0;
260 }
261 
262 
263 /* Parse and consume a JSON integer. Throws an error if the value does not fit
264  * in an uint64_t, is not an integer or is larger than 'max'. */
rint64(uint64_t * val,uint64_t max)265 static int rint64(uint64_t *val, uint64_t max) {
266   uint64_t v;
267   int haschar = 0;
268   *val = 0;
269   while(1) {
270     C(!*ctx->buf && fill(1));
271     if(*ctx->buf == '0' && !haschar) {
272       con(1);
273       break;
274     }
275     if(*ctx->buf >= '0' && *ctx->buf <= '9') {
276       haschar = 1;
277       v = (*val)*10 + (*ctx->buf-'0');
278       E(v < *val, "Invalid (positive) integer");
279       *val = v;
280       con(1);
281       continue;
282     }
283     E(!haschar, "Invalid (positive) integer");
284     break;
285   }
286   E(*val > max, "Integer out of range");
287   return 0;
288 }
289 
290 
291 /* Parse and consume a JSON number. The result is discarded.
292  * TODO: Improve validation. */
rnum(void)293 static int rnum(void) {
294   int haschar = 0;
295   C(rfill1);
296   while(1) {
297     C(!*ctx->buf && fill(1));
298     if(*ctx->buf == 'e' || *ctx->buf == 'E' || *ctx->buf == '-' || *ctx->buf == '+' || *ctx->buf == '.' || (*ctx->buf >= '0' && *ctx->buf <= '9')) {
299       haschar = 1;
300       con(1);
301     } else {
302       E(!haschar, "Invalid JSON value");
303       break;
304     }
305   }
306   return 0;
307 }
308 
309 
rlit(const char * v,int len)310 static int rlit(const char *v, int len) {
311   C(rfill(len));
312   E(strncmp(ctx->buf, v, len) != 0, "Invalid JSON value");
313   con(len);
314   return 0;
315 }
316 
317 
318 /* Parse the "<space> <string> <space> : <space>" part of an object key. */
rkey(char * dest,int destlen)319 static int rkey(char *dest, int destlen) {
320   C(cons() || rstring(dest, destlen) || cons());
321   E(*ctx->buf != ':', "Expected ':'");
322   con(1);
323   return cons();
324 }
325 
326 
327 /* (Recursively) parse and consume any JSON value. The result is discarded. */
rval(void)328 static int rval(void) {
329   C(rfill1);
330   switch(*ctx->buf) {
331   case 't': /* true */
332     C(rlit("true", 4));
333     break;
334   case 'f': /* false */
335     C(rlit("false", 5));
336     break;
337   case 'n': /* null */
338     C(rlit("null", 4));
339     break;
340   case '"': /* string */
341     C(rstring(NULL, 0));
342     break;
343   case '{': /* object */
344     con(1);
345     while(1) {
346       C(cons());
347       if(*ctx->buf == '}')
348         break;
349       C(rkey(NULL, 0) || rval() || cons());
350       if(*ctx->buf == '}')
351         break;
352       E(*ctx->buf != ',', "Expected ',' or '}'");
353       con(1);
354     }
355     con(1);
356     break;
357   case '[': /* array */
358     con(1);
359     while(1) {
360       C(cons());
361       if(*ctx->buf == ']')
362         break;
363       C(cons() || rval() || cons());
364       if(*ctx->buf == ']')
365         break;
366       E(*ctx->buf != ',', "Expected ',' or ']'");
367       con(1);
368     }
369     con(1);
370     break;
371   default: /* assume number */
372     C(rnum());
373     break;
374   }
375 
376   return 0;
377 }
378 
379 
380 /* Consumes everything up to the root item, and checks that this item is a dir. */
header(void)381 static int header(void) {
382   uint64_t v;
383 
384   C(cons());
385   E(*ctx->buf != '[', "Expected JSON array");
386   con(1);
387   C(cons() || rint64(&v, 10000) || cons());
388   E(v != 1, "Incompatible major format version");
389   E(*ctx->buf != ',', "Expected ','");
390   con(1);
391   C(cons() || rint64(&v, 10000) || cons()); /* Ignore the minor version for now */
392   E(*ctx->buf != ',', "Expected ','");
393   con(1);
394   /* Metadata block is currently ignored */
395   C(cons() || rval() || cons());
396   E(*ctx->buf != ',', "Expected ','");
397   con(1);
398 
399   C(cons());
400   E(*ctx->buf != '[', "Top-level item must be a directory");
401 
402   return 0;
403 }
404 
405 
406 static int item(uint64_t);
407 
408 /* Read and add dir contents */
itemdir(uint64_t dev)409 static int itemdir(uint64_t dev) {
410   while(1) {
411     C(cons());
412     if(*ctx->buf == ']')
413       break;
414     E(*ctx->buf != ',', "Expected ',' or ']'");
415     con(1);
416     C(cons() || item(dev));
417   }
418   con(1);
419   C(cons());
420   return 0;
421 }
422 
423 
424 /* Reads a JSON object representing a struct dir/dir_ext item. Writes to
425  * ctx->buf_dir, ctx->buf_ext, ctx->buf_name and ctx->nlink. */
iteminfo(void)426 static int iteminfo(void) {
427   uint64_t iv;
428 
429   E(*ctx->buf != '{', "Expected JSON object");
430   con(1);
431 
432   while(1) {
433     C(rkey(ctx->val, MAX_VAL));
434     /* TODO: strcmp() in this fashion isn't very fast. */
435     if(strcmp(ctx->val, "name") == 0) {              /* name */
436       ctx->val[MAX_VAL-1] = 1;
437       C(rstring(ctx->val, MAX_VAL));
438       E(ctx->val[MAX_VAL-1] != 1, "Too large string value");
439       strcpy(ctx->buf_name, ctx->val);
440     } else if(strcmp(ctx->val, "asize") == 0) {      /* asize */
441       C(rint64(&iv, INT64_MAX));
442       ctx->buf_dir->asize = iv;
443     } else if(strcmp(ctx->val, "dsize") == 0) {      /* dsize */
444       C(rint64(&iv, INT64_MAX));
445       ctx->buf_dir->size = iv;
446     } else if(strcmp(ctx->val, "dev") == 0) {        /* dev */
447       C(rint64(&iv, UINT64_MAX));
448       ctx->buf_dir->dev = iv;
449     } else if(strcmp(ctx->val, "ino") == 0) {        /* ino */
450       C(rint64(&iv, UINT64_MAX));
451       ctx->buf_dir->ino = iv;
452     } else if(strcmp(ctx->val, "uid") == 0) {        /* uid */
453       C(rint64(&iv, INT32_MAX));
454       ctx->buf_dir->flags |= FF_EXT;
455       ctx->buf_ext->uid = iv;
456     } else if(strcmp(ctx->val, "gid") == 0) {        /* gid */
457       C(rint64(&iv, INT32_MAX));
458       ctx->buf_dir->flags |= FF_EXT;
459       ctx->buf_ext->gid = iv;
460     } else if(strcmp(ctx->val, "mode") == 0) {       /* mode */
461       C(rint64(&iv, UINT16_MAX));
462       ctx->buf_dir->flags |= FF_EXT;
463       ctx->buf_ext->mode = iv;
464     } else if(strcmp(ctx->val, "mtime") == 0) {      /* mtime */
465       C(rint64(&iv, UINT64_MAX));
466       ctx->buf_dir->flags |= FF_EXT;
467       ctx->buf_ext->mtime = iv;
468       // Accept decimal numbers, but discard the fractional part because our data model doesn't support it.
469       if(*ctx->buf == '.') {
470           con(1);
471           while(*ctx->buf >= '0' && *ctx->buf <= '9')
472               con(1);
473       }
474     } else if(strcmp(ctx->val, "hlnkc") == 0) {      /* hlnkc */
475       if(*ctx->buf == 't') {
476         C(rlit("true", 4));
477         ctx->buf_dir->flags |= FF_HLNKC;
478       } else
479         C(rlit("false", 5));
480     } else if(strcmp(ctx->val, "nlink") == 0) {      /* nlink */
481       C(rint64(&iv, UINT32_MAX));
482       if(iv > 1)
483         ctx->buf_dir->flags |= FF_HLNKC;
484       ctx->nlink = iv;
485     } else if(strcmp(ctx->val, "read_error") == 0) { /* read_error */
486       if(*ctx->buf == 't') {
487         C(rlit("true", 4));
488         ctx->buf_dir->flags |= FF_ERR;
489       } else
490         C(rlit("false", 5));
491     } else if(strcmp(ctx->val, "excluded") == 0) {   /* excluded */
492       C(rstring(ctx->val, 8));
493       if(strcmp(ctx->val, "otherfs") == 0)
494         ctx->buf_dir->flags |= FF_OTHFS;
495       else if(strcmp(ctx->val, "kernfs") == 0)
496         ctx->buf_dir->flags |= FF_KERNFS;
497       else if(strcmp(ctx->val, "frmlnk") == 0)
498         ctx->buf_dir->flags |= FF_FRMLNK;
499       else
500         ctx->buf_dir->flags |= FF_EXL;
501     } else if(strcmp(ctx->val, "notreg") == 0) {     /* notreg */
502       if(*ctx->buf == 't') {
503         C(rlit("true", 4));
504         ctx->buf_dir->flags &= ~FF_FILE;
505       } else
506         C(rlit("false", 5));
507     } else
508       C(rval());
509 
510     C(cons());
511     if(*ctx->buf == '}')
512       break;
513     E(*ctx->buf != ',', "Expected ',' or '}'");
514     con(1);
515   }
516   con(1);
517 
518   E(!*ctx->buf_name, "No name field present in item information object");
519   ctx->items++;
520   /* Only call input_handle() once for every 32 items. Importing items is so
521    * fast that the time spent in input_handle() dominates when called every
522    * time. Don't set this value too high, either, as feedback should still be
523    * somewhat responsive when our import data comes from a slow-ish source. */
524   return !(ctx->items & 31) ? input_handle(1) : 0;
525 }
526 
527 
528 /* Recursively reads a file or directory item */
item(uint64_t dev)529 static int item(uint64_t dev) {
530   int isdir = 0;
531   int isroot = ctx->items == 0;
532 
533   if(*ctx->buf == '[') {
534     isdir = 1;
535     con(1);
536     C(cons());
537   }
538 
539   memset(ctx->buf_dir, 0, offsetof(struct dir, name));
540   memset(ctx->buf_ext, 0, sizeof(struct dir_ext));
541   ctx->nlink = 0;
542   *ctx->buf_name = 0;
543   ctx->buf_dir->flags |= isdir ? FF_DIR : FF_FILE;
544   ctx->buf_dir->dev = dev;
545 
546   C(iteminfo());
547   dev = ctx->buf_dir->dev;
548 
549   if(isroot)
550     dir_curpath_set(ctx->buf_name);
551   else
552     dir_curpath_enter(ctx->buf_name);
553 
554   if(isdir) {
555     if(dir_output.item(ctx->buf_dir, ctx->buf_name, ctx->buf_ext, ctx->nlink)) {
556       dir_seterr("Output error: %s", strerror(errno));
557       return 1;
558     }
559     C(itemdir(dev));
560     if(dir_output.item(NULL, 0, NULL, 0)) {
561       dir_seterr("Output error: %s", strerror(errno));
562       return 1;
563     }
564   } else if(dir_output.item(ctx->buf_dir, ctx->buf_name, ctx->buf_ext, ctx->nlink)) {
565     dir_seterr("Output error: %s", strerror(errno));
566     return 1;
567   }
568 
569   if(!isroot)
570     dir_curpath_leave();
571 
572   return 0;
573 }
574 
575 
footer(void)576 static int footer(void) {
577   while(1) {
578     C(cons());
579     if(*ctx->buf == ']')
580       break;
581     E(*ctx->buf != ',', "Expected ',' or ']'");
582     con(1);
583     C(cons() || rval());
584   }
585   con(1);
586   C(cons());
587   E(*ctx->buf, "Trailing garbage");
588   return 0;
589 }
590 
591 
process(void)592 static int process(void) {
593   int fail = 0;
594 
595   header();
596 
597   if(!dir_fatalerr)
598     fail = item(0);
599 
600   if(!dir_fatalerr && !fail)
601     footer();
602 
603   if(fclose(ctx->stream) && !dir_fatalerr && !fail)
604     dir_seterr("Error closing file: %s", strerror(errno));
605   free(ctx->buf_dir);
606   free(ctx);
607 
608   while(dir_fatalerr && !input_handle(0))
609     ;
610   return dir_output.final(dir_fatalerr || fail);
611 }
612 
613 
dir_import_init(const char * fn)614 int dir_import_init(const char *fn) {
615   FILE *stream;
616   if(strcmp(fn, "-") == 0)
617     stream = stdin;
618   else if((stream = fopen(fn, "r")) == NULL)
619     return 1;
620 
621   ctx = xmalloc(sizeof(struct ctx));
622   ctx->stream = stream;
623   ctx->line = 1;
624   ctx->byte = ctx->eof = ctx->items = 0;
625   ctx->buf = ctx->lastfill = ctx->readbuf;
626   ctx->buf_dir = xmalloc(dir_memsize(""));
627   ctx->readbuf[0] = 0;
628 
629   dir_curpath_set(fn);
630   dir_process = process;
631   dir_import_active = 1;
632   return 0;
633 }
634 
635