1 
2 #include "pkgcache.h"
3 #include "errors.h"
4 #include "winfiles.h"
5 
6 #include <fcntl.h>
7 #include <unistd.h>
8 #include <errno.h>
9 #include <ctype.h>
10 
/* First element of a character vector. */
#define STR1(x) STRING_ELT(x, 0)
/* Number of distinct hash values produced by hash_string(). */
#define HASH_SIZE 256
/* Number of hash-table slots reserved per hash value. */
#define MAX_COLL 10

/* Status codes returned by hash_update() when it is asked not to throw. */
#define ERROR_TOO_MANY_COLUMNS 1
#define ERROR_HASH_TABLE_FULL  2

/*
 * djb2 hash (hash * 33 + c) of the first `strlen` bytes of `str`,
 * reduced to the range [0, HASH_SIZE).
 *
 * The input is only read, never written: the key does not have to be
 * NUL-terminated, may live in read-only memory, and may contain NUL bytes.
 * Bytes are read as plain `char`, so the result for bytes >= 0x80 depends
 * on the platform's `char` signedness.
 */
static inline int hash_string(const char *str, int strlen) {
  unsigned long hash = 5381;

  for (int i = 0; i < strlen; i++) {
    int c = str[i];
    hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
  }

  return (int) (hash % HASH_SIZE);
}
31 
32 struct hash_table {
33   SEXP nms;
34   SEXP *nmsptr;
35   SEXP cols;
36   int *tab;
37   int tablen;
38   int nfld;
39   int max_cols;
40   int npkgs;
41 };
42 
/*
 * Initialize `table` on top of caller-allocated R objects.
 *
 * nms      character vector that receives the field names
 * cols     list that receives one character vector per field
 * tab      integer vector used as the slot array; it is zeroed here,
 *          0 meaning "empty slot"
 * max_cols maximum number of fields the table may hold
 * npkgs    length of every column allocated later by hash_update()
 *
 * NOTE(review): recent R releases reportedly flag STRING_PTR() as non-API;
 * consider STRING_PTR_RO() with a const-qualified `nmsptr` -- confirm
 * against the minimum R version this package supports.
 */
static void hash_create(struct hash_table *table, SEXP nms, SEXP cols,
                        SEXP tab, int max_cols, int npkgs) {
  int *slots = INTEGER(tab);
  int nslots = LENGTH(tab);

  *table = (struct hash_table) {
    .nms = nms,
    .nmsptr = STRING_PTR(nms),
    .cols = cols,
    .tab = slots,
    .tablen = nslots,
    .nfld = 0,
    .max_cols = max_cols,
    .npkgs = npkgs
  };

  /* Every slot starts out empty. */
  memset(slots, 0, sizeof(int) * nslots);
}
55 
/*
 * Store `val` as the value of field `key` (first `keylen` bytes) for
 * package number `npkg`, creating the column for the field on first use.
 *
 * Lookup starts at slot `hash * MAX_COLL` and probes linearly towards the
 * end of the slot array. An empty slot means the field is new: its name is
 * appended to `nms`, a column of `npkgs` strings is allocated, and the slot
 * records the 1-based column index.
 *
 * A stored name matches only if it has exactly `keylen` bytes and they are
 * equal to the key; comparing just the first `keylen` bytes would also
 * match longer names that merely start with the key.
 *
 * Returns 0 on success. On failure it throws an R error if `err` is
 * non-zero, otherwise it returns ERROR_TOO_MANY_COLUMNS or
 * ERROR_HASH_TABLE_FULL.
 */
static inline int hash_update(struct hash_table *table, char *key, int keylen,
                              int npkg, SEXP val, int err) {
  int len = table->tablen;
  int *t = table->tab;
  int hash = hash_string(key, keylen);
  int start = hash * MAX_COLL;

  for (; start < len; start++) {
    int p = t[start];
    if (p == 0) {

      if (table->nfld == table->max_cols) {
        if (err) {                                                   // __NO_COVERAGE__
          R_THROW_ERROR(                                             // __NO_COVERAGE__
            "Internal pkgcache error, too many different fields in " // __NO_COVERAGE__
            "PACKAGES or DESCRIPTION data, please report a bug"      // __NO_COVERAGE__
          );                                                         // __NO_COVERAGE__
        }                                                            // __NO_COVERAGE__
        return ERROR_TOO_MANY_COLUMNS;                               // __NO_COVERAGE__
      }

      SET_STRING_ELT(table->nms, table->nfld, Rf_mkCharLenCE(key, keylen, CE_NATIVE));
      SET_VECTOR_ELT(table->cols, table->nfld, allocVector(STRSXP, table->npkgs));
      SET_STRING_ELT(VECTOR_ELT(table->cols, table->nfld), npkg, val);
      table->nfld += 1;
      t[start] = table->nfld;
      return 0;

    } else {
      p--;
      SEXP nm = table->nmsptr[p];
      /* The length check comes first, so strncmp() never reads past
         either string and a prefix is never taken for a match. */
      if (LENGTH(nm) == keylen && !strncmp(key, CHAR(nm), keylen)) {
        SET_STRING_ELT(VECTOR_ELT(table->cols, p), npkg, val);
        return 0;
      }
    }
  }                                                                      // __NO_COVERAGE__

  if (err) {                                                             // __NO_COVERAGE__
    R_THROW_ERROR(                                                       // __NO_COVERAGE__
      "Internal pkgcache error, hash table is full, please report a bug" // __NO_COVERAGE__
    );                                                                   // __NO_COVERAGE__
  }                                                                      // __NO_COVERAGE__
  return ERROR_HASH_TABLE_FULL;                                          // __NO_COVERAGE__
}
100 
101 /* --------------------------------------------------------------------- */
102 
pkgcache__read_file_raw(const char * cpath)103 SEXP pkgcache__read_file_raw(const char *cpath) {
104   SEXP result = R_NilValue;
105   int err;
106   int fd = open_file(cpath, O_RDONLY);
107 
108   if (fd == -1) {
109     return(R_FORMAT_SYSTEM_ERROR("Cannot open file `%s`", cpath));
110   }
111 
112   off_t len = lseek(fd, 0, SEEK_END);
113   if (len == -1) {
114     err = errno;                                                       // __NO_COVERAGE__
115     close(fd);                                                         // __NO_COVERAGE__
116     return R_FORMAT_SYSTEM_ERROR_CODE(err, "Cannot seek `%s`", cpath); // __NO_COVERAGE__
117   }
118   off_t len2 = lseek(fd, 0, SEEK_SET);
119   if (len2 == -1) {
120     err = errno;                                                       // __NO_COVERAGE__
121     close(fd);                                                         // __NO_COVERAGE__
122     return R_FORMAT_SYSTEM_ERROR_CODE(err, "Cannot seek `%s`", cpath); // __NO_COVERAGE__
123   }
124 
125   /* TODO: should use cleancall to close the file if allocVector fails */
126 
127   result = PROTECT(allocVector(RAWSXP, len));
128 
129   ssize_t ret = read(fd, RAW(result), len);
130   if (ret == -1) {
131     err = errno;                                                       // __NO_COVERAGE__
132     close(fd);                                                         // __NO_COVERAGE__
133     UNPROTECT(1);                                                      // __NO_COVERAGE__
134     return R_FORMAT_SYSTEM_ERROR_CODE(err, "Cannot read `%s`", cpath); // __NO_COVERAGE__
135   }
136 
137   close(fd);
138 
139   UNPROTECT(1);
140   return result;
141 }
142 
143 /* --------------------------------------------------------------------- */
144 
/*
 * Read every file named in `paths` and return a list of the same length.
 * Each element is whatever pkgcache__read_file_raw() returned for that
 * path; no error is raised here for files that could not be read.
 */
SEXP pkgcache_read_raw(SEXP paths) {
  const R_xlen_t nfiles = XLENGTH(paths);
  SEXP contents = PROTECT(allocVector(VECSXP, nfiles));
  R_xlen_t idx = 0;

  while (idx < nfiles) {
    const char *cpath = CHAR(STRING_ELT(paths, idx));
    SEXP item = pkgcache__read_file_raw(cpath);
    SET_VECTOR_ELT(contents, idx, item);
    idx++;
  }

  UNPROTECT(1);
  return contents;
}
160 
161 /* --------------------------------------------------------------------- */
162 
163 #define S_BG 0                  /* beginning of the file */
164 #define S_KW 1                  /* inside a keyword */
165 #define S_VL 2                  /* inside a value */
166 #define S_NL 3                  /* right after a newline */
167 #define S_WS 4                  /* after newline + space */
168 
pkgcache_parse_description_raw(SEXP raw)169 SEXP pkgcache_parse_description_raw(SEXP raw) {
170   char *p = NULL, *start = (char*) RAW(raw);
171   char *end = start + XLENGTH(raw);
172   int state = S_BG;
173   char *kw = NULL, *vl = NULL;
174   int kwsize = 0, vlsize = 0;
175   int linum = 1;
176 
177   SEXP result = PROTECT(allocVector(STRSXP, 200));
178   SEXP names = PROTECT(allocVector(STRSXP, 200));
179   int ridx = 0;
180 
181   for (p = start; p < end; ) {
182     switch (state) {
183 
184     /* -- at the begining ---------------------------------------------- */
185     case S_BG:
186       if (*p == ':' || *p == '\r' || *p == '\n' || *p == ' ' || *p == '\t') {
187         R_THROW_ERROR(
188           "Invalid DESCRIPTION file, must start with an "
189           "alphanumeric character"
190         );
191       }
192       /* Otherwise it must be the start of a keyword */
193       kw = p++;
194       state = S_KW;
195 
196       break;
197 
198     /* -- within a keyword --------------------------------------------- */
199     case S_KW:
200       /* Maybe the keyword ends here, and a value starts */
201       if (*p == ':') {
202         kwsize = p - kw;
203         p++;
204         vl = p;
205         if (*vl == ' ') vl++;
206         state = S_VL;
207 
208       /* A newline within a keyword is an error */
209       } else if (*p == '\n') {
210         R_THROW_ERROR(
211           "Line %d invalid in DESCRIPTION: must be of form `key: value`",
212           linum
213         );
214 
215       /* Otherwise we are inside the keyword */
216       } else {
217         p++;
218       }
219 
220       break;
221 
222     /* --- within a value ---------------------------------------------- */
223     case S_VL:
224       /* newline might be the end of the value, if no continuation. */
225       if (*p == '\n') {
226         state = S_NL;
227         vlsize = p - vl;
228         p++;
229         linum++;
230 
231       } else {
232         p++;
233       }
234       break;
235 
236     /* -- right after a newline ---------------------------------------- */
237     case S_NL:
238       /* maybe a continuation line */
239       if (*p == ' ' || *p == '\t') {
240         state = S_WS;
241         p++;
242 
243       /* othewise we can save the field, and start parsing the next one */
244       } else {
245         SET_STRING_ELT(result, ridx, Rf_mkCharLenCE(vl, vlsize, CE_BYTES));
246         SET_STRING_ELT(names, ridx, Rf_mkCharLenCE(kw, kwsize, CE_NATIVE));
247         ridx++;
248         kw = p;
249         state = S_KW;
250         p++;
251       }
252 
253       break;
254 
255     /* -- after continuation space ------------------------------------- */
256     case S_WS:
257       /* more whitespace? */
258       if (*p == ' ' || *p == '\t') {
259         p++;
260 
261       /* otherwise continuation line, so this is still the value */
262       } else {
263         state = S_VL;
264         p++;
265       }
266       break;
267 
268     /* ----------------------------------------------------------------- */
269     default:
270       R_THROW_ERROR("Internal DESCRIPTION parser error"); // __NO_COVERAGE__
271       break;                                              // __NO_COVERAGE__
272     }
273   }
274 
275   if (state == S_KW) {
276     R_THROW_ERROR("DESCRIPTION file ended while parsing a key");
277   } else if (state != S_BG) {
278     /* Strip the trailing newline(s) */
279     while (p - 1 > start && *(p-1) == '\n') p--;
280     vlsize = p - vl;
281     SET_STRING_ELT(result, ridx, Rf_mkCharLenCE(vl, vlsize, CE_BYTES));
282     SET_STRING_ELT(names, ridx, Rf_mkCharLenCE(kw, kwsize, CE_NATIVE));
283     ridx++;
284   }
285 
286   Rf_setAttrib(result, R_NamesSymbol, names);
287   SEXP final = PROTECT(Rf_lengthgets(result, ridx));
288 
289   UNPROTECT(3);
290   return final;
291 }
292 
293 /* --------------------------------------------------------------------- */
294 
/*
 * Read the DESCRIPTION file named by the first element of `path` and
 * parse it with pkgcache_parse_description_raw().
 *
 * Throws an R error if the file cannot be read; in that case the first
 * string of the object returned by pkgcache__read_file_raw() is used as
 * the message.
 */
SEXP pkgcache_parse_description(SEXP path) {
  SEXP raw = PROTECT(pkgcache__read_file_raw(CHAR(STRING_ELT(path, 0))));
  if (TYPEOF(raw) != RAWSXP) {
    /* The message embeds the file name, so it must be passed as data and
       never as the format string: a `%` in the path would otherwise be
       interpreted as a conversion specification. */
    R_THROW_ERROR("%s", CHAR(STRING_ELT(raw, 0)));
  }

  SEXP desc = PROTECT(pkgcache_parse_description_raw(raw));

  UNPROTECT(2);
  return desc;
}
306 
307 /* --------------------------------------------------------------------- */
308 
/*
 * Parse the bytes of a PACKAGES file held in the raw vector `raw`.
 *
 * Returns R_NilValue for input that is empty or holds only line breaks.
 * Otherwise returns a list named by the field keys, one character vector
 * per field; the vectors' length is the package count estimated from the
 * blank lines in the input. Values are created with CE_BYTES.
 *
 * The last byte of `raw` is temporarily replaced by a NUL so that the
 * buffer can be scanned as a C string. It is put back before returning
 * and before the parse errors raised here. Errors raised from inside
 * hash_update() or mkCharLenCE() still leave the NUL in place.
 */
SEXP pkgcache_parse_packages_raw(SEXP raw) {
  int len = LENGTH(raw);
  char *p = NULL;
  int npkgs = 1;

  if (len == 0) return R_NilValue;

  /* ------------------------------------------------------------------- */
  /* Count number of empty lines, to guess the number of packages */
  char *base = (char*) RAW(raw);
  p = base;
  char tail = p[len - 1];
  p[len - 1] = '\0';

  /* Skip whitespace first, check for empty file */

  while (*p == '\n' || *p == '\r') p++;
  if (*p == '\0') {
    base[len - 1] = tail;       /* do not leave the input modified */
    return R_NilValue;
  }

  /* This is faster than manual search, because strchr is optimized.
     It is also faster than strstr, for this special case of a two
     character pattern. */

  for (;;) {
    p = strchr(p, '\n');
    if (p == NULL) break;
    p++;
    if (*p == '\n' || *p == '\r') {
      p++;
      npkgs++;
      while (*p == '\n' || *p == '\r') p++;
      if (*p == '\0') npkgs--;
    }
  }

  /* ------------------------------------------------------------------- */

  int state = S_BG;
  char *kw = NULL, *vl = NULL;
  int kwsize = 0, vlsize = 0;
  int linum = 1;
  int max_cols = 1000;

  SEXP nms = PROTECT(allocVector(STRSXP, max_cols));
  SEXP cols = PROTECT(allocVector(VECSXP, max_cols));
  SEXP tab = PROTECT(allocVector(INTSXP, HASH_SIZE * MAX_COLL));
  struct hash_table table;
  hash_create(&table, nms, cols, tab, max_cols, npkgs);
  int npkg = 0;

  p = base;
  while (*p != '\0') {
    switch (state) {

    /* -- at the beginning of a package -------------------------------- */
    case S_BG:
      if (*p == '\r') {
        p++;
      } else if (*p == '\n') {
        linum++;
        p++;
      } else if (*p == ':' || *p == ' ' || *p == '\t') {
        base[len - 1] = tail;
        R_THROW_ERROR(
          "Invalid PACKAGES file in line %d: expected key",
          linum
        );
      } else {
        kw = p++;
        state = S_KW;
      }
      break;

    /* -- within a keyword --------------------------------------------- */
    case S_KW:
      if (*p == ':') {
        kwsize = p - kw;
        p++;
        vl = p;
        if (*vl == ' ') vl++;   /* skip leading space */
        state = S_VL;

      } else if (*p == '\n') {
        base[len - 1] = tail;
        R_THROW_ERROR(
          "Invalid line %d in PACKAGES file: must contain `:`",
          linum
        );

      } else {
        p++;
      }

      break;

    /* --- within a value ---------------------------------------------- */
    case S_VL:
      /* newline might be the end of the value, if no continuation. */
      if (*p == '\n') {
        state = S_NL;
        vlsize = p - vl;
        p++;
        linum++;

      } else {
        p++;
      }
      break;

    /* -- right after a newline ---------------------------------------- */
    case S_NL:

      /* maybe a continuation line */
      if (*p == ' ' || *p == '\t') {
        state = S_WS;
        p++;

      /* end of field */
      } else {
        /* Save field */
        SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
        hash_update(&table, kw, kwsize, npkg, val, /* err */ 1);
        UNPROTECT(1);

        /* end of package? */
        if (*p == '\n') {
          p++;
          npkg++;
          linum++;
          state = S_BG;

        } else if (*p == '\r' && *(p+1) == '\n') {
          p++;
          p++;
          npkg++;
          linum++;
          state = S_BG;

        /* or just a new key */
        } else {
          kw = p;
          p++;
          state = S_KW;
        }
      }

      break;

    /* -- after continuation space ------------------------------------- */
    case S_WS:
      /* more whitespace? */
      if (*p == ' ' || *p == '\t') {
        p++;

      /* otherwise continuation line, so this is still the value */
      } else {
        state = S_VL;
        p++;
      }

      break;

    /* ----------------------------------------------------------------- */
    default:
      base[len - 1] = tail;                             // __NO_COVERAGE__
      R_THROW_ERROR("Internal PACKAGES parser error");  // __NO_COVERAGE__
      break;                                            // __NO_COVERAGE__
    }
  }

  /* Work out the length of the value that was open when the input ended.
     - Inside a value or a continuation line the value runs up to `p`,
       plus the final byte that was replaced by the NUL, unless that
       byte was the newline terminating the value.
     - Right after a newline the length recorded at that newline is
       already correct; recomputing it would append the newline.
     - In the other states there is no open value (`vl` may be NULL). */
  if (state == S_VL || state == S_WS) {
    vlsize = p - vl;
    if (tail != '\n') vlsize++;
  }
  base[len - 1] = tail;

  if (state == S_KW) {
    R_THROW_ERROR("PACKAGES file ended while parsing a key");
  } else if (state != S_BG) {
    /* Save field */
    SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
    hash_update(&table, kw, kwsize, npkg, val, /* err= */ 1);
    UNPROTECT(1);
  }

  /* ------------------------------------------------------------------- */

  Rf_setAttrib(cols, R_NamesSymbol, nms);
  SEXP final = PROTECT(Rf_lengthgets(cols, table.nfld));
  UNPROTECT(4);
  return final;
}
496 
497 /* --------------------------------------------------------------------- */
498 
/*
 * Read and parse several DESCRIPTION files into one table.
 *
 * paths     character vector of file names, one package per file
 * lowercase logical; if true, field names are converted to lower case
 *
 * Returns a list of three elements:
 *   1. a list named by the field keys, one character vector of
 *      length(paths) per field (values created with CE_BYTES),
 *   2. a character vector of length(paths) holding the error message of
 *      each file that could not be read or parsed,
 *   3. a logical scalar, true if any file failed.
 *
 * A file that fails does not abort the call; fields stored before the
 * failure was detected are kept. Internal hash table errors are thrown.
 */
SEXP pkgcache_parse_descriptions(SEXP paths, SEXP lowercase) {
  int npkg, npkgs = LENGTH(paths);
  int clowercase = LOGICAL(lowercase)[0];

  int state = S_BG;
  char *kw = NULL, *vl = NULL;
  int kwsize = 0, vlsize = 0;
  int linum = 1;
  int haserrors = 0;

  int max_cols = 1000;

  SEXP errors = PROTECT(allocVector(STRSXP, npkgs));
  SEXP nms = PROTECT(allocVector(STRSXP, max_cols));
  SEXP cols = PROTECT(allocVector(VECSXP, max_cols));
  SEXP tab = PROTECT(allocVector(INTSXP, HASH_SIZE * MAX_COLL));
  struct hash_table table;
  hash_create(&table, nms, cols, tab, max_cols, npkgs);

  for (npkg = 0; npkg < npkgs; npkg++) {

    const char *cpath = CHAR(STRING_ELT(paths, npkg));
    SEXP raw = PROTECT(pkgcache__read_file_raw(cpath));
    if (TYPEOF(raw) != RAWSXP) {
      SET_STRING_ELT(errors, npkg, STR1(raw));
      UNPROTECT(1);
      goto failedpkg;
    }

    state = S_BG;
    kw = NULL;
    vl = NULL;
    kwsize = 0;
    vlsize = 0;
    linum = 1;

    int len = LENGTH(raw);
    if (len == 0) {
      /* Nothing to parse, and there is no last byte to use as sentinel. */
      SET_STRING_ELT(
        errors,
        npkg,
        STR1(R_FORMAT_ERROR(
          "`%s` is empty",
          cpath
        ))
      );
      UNPROTECT(1);
      goto failedpkg;
    }

    /* The last byte is replaced by a NUL, so the buffer can be scanned
       as a C string; it is accounted for after the loop. */
    char *p = (char*) RAW(raw);
    char tail = p[len - 1];
    p[len - 1] = '\0';

    while (*p != '\0') {
      switch(state) {
      /* -- at the beginning ------------------------------------------- */
      case S_BG:
        if (*p == ':' || *p == '\r' || *p == '\n' || *p == ' ' || *p == '\t') {
          SET_STRING_ELT(
            errors,
            npkg,
            STR1(R_FORMAT_ERROR(
              "`%s` is invalid, must start with an alphanumeric character",
              cpath
            ))
          );
          UNPROTECT(1);
          goto failedpkg;
        }
        /* Otherwise it must be the start of a keyword */
        if (clowercase) *p = (char) tolower((unsigned char) *p);
        kw = p++;
        state = S_KW;

        break;

      /* -- within a keyword ------------------------------------------- */
      case S_KW:
        /* Maybe the keyword ends here, and a value starts */
        if (*p == ':') {
          kwsize = p - kw;
          p++;
          vl = p;
          if (*vl == ' ') vl++;
          state = S_VL;

        /* A newline within a keyword is an error */
        } else if (*p == '\n') {
          SET_STRING_ELT(
            errors,
            npkg,
            STR1(R_FORMAT_ERROR(
              "Line %d is invalid in `%s`: must contain `:`",
              linum,
              cpath
            ))
          );
          UNPROTECT(1);
          goto failedpkg;

        /* Otherwise we are inside the keyword */
        } else {
          if (clowercase) *p = (char) tolower((unsigned char) *p);
          p++;
        }

        break;

      /* --- within a value -------------------------------------------- */
      case S_VL:
        if (*p == '\n') {
          state = S_NL;
          vlsize = p - vl;
          p++;
          linum++;

        } else {
          p++;
        }

        break;

      /* -- right after a newline -------------------------------------- */
      case S_NL:
        /* maybe a continuation line */
        if (*p == ' ' || *p == '\t') {
          state = S_WS;
          p++;

        /* otherwise we can save the field, and start parsing the next one */
        } else {
          SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
          hash_update(&table, kw, kwsize, npkg, val, 1);
          UNPROTECT(1);

          kw = p;
          state = S_KW;
          if (clowercase) *p = (char) tolower((unsigned char) *p);
          p++;
        }

        break;

      /* -- after continuation space ----------------------------------- */
      case S_WS:
        /* more whitespace? */
        if (*p == ' ' || *p == '\t') {
          p++;

        /* otherwise continuation line, so this is still the value */
        } else {
          state = S_VL;
          p++;
        }

        break;

      /* --------------------------------------------------------------- */
      default:
        R_THROW_ERROR("Internal DESCRIPTION parser error");  // __NO_COVERAGE__
        break;                                               // __NO_COVERAGE__
      }
    }

    /* Length of the value that was open when the input ended.
       - Inside a value or a continuation line it runs up to `p`, plus
         the final byte that was replaced by the NUL, unless that byte
         was the newline terminating the value.
       - Right after a newline the length recorded at that newline is
         already correct.
       - In the other states there is no open value (`vl` may be NULL). */
    if (state == S_VL || state == S_WS) {
      vlsize = p - vl;
      if (tail != '\n') vlsize++;
    }
    p = (char*) RAW(raw);
    p[len - 1] = tail;

    if (state == S_KW) {
      SET_STRING_ELT(
        errors,
        npkg,
        STR1(R_FORMAT_ERROR(
          "`%s` ended while parsing a key",
          cpath
        ))
      );
      UNPROTECT(1);
      goto failedpkg;

    } else if (state == S_BG) {
      /* No key was ever started, so there is no field to save. */
      SET_STRING_ELT(
        errors,
        npkg,
        STR1(R_FORMAT_ERROR(
          "`%s` does not contain any `key: value` fields",
          cpath
        ))
      );
      UNPROTECT(1);
      goto failedpkg;

    } else {
      /* Save field */
      SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
      hash_update(&table, kw, kwsize, npkg, val, /* err = */ 1);
      UNPROTECT(1);
    }

    UNPROTECT(1);
    continue;

  failedpkg:
    haserrors = 1;
  }

  Rf_setAttrib(cols, R_NamesSymbol, nms);
  SEXP final = PROTECT(allocVector(VECSXP, 3));
  SET_VECTOR_ELT(final, 0, Rf_lengthgets(cols, table.nfld));
  SET_VECTOR_ELT(final, 1, errors);
  SET_VECTOR_ELT(final, 2, ScalarLogical(haserrors));

  UNPROTECT(5);
  return final;
}
691