1
2 #include "pkgcache.h"
3 #include "errors.h"
4 #include "winfiles.h"
5
6 #include <fcntl.h>
7 #include <unistd.h>
8 #include <errno.h>
9 #include <ctype.h>
10
11 #define STR1(x) STRING_ELT(x, 0)
12 #define HASH_SIZE 256
13 #define MAX_COLL 10
14
15 #define ERROR_TOO_MANY_COLUMNS 1
16 #define ERROR_HASH_TABLE_FULL 2
17
hash_string(char * str,int strlen)18 static R_INLINE int hash_string(char *str, int strlen) {
19 int backup = str[strlen];
20 str[strlen] = '\0';
21 unsigned long hash = 5381;
22 int c;
23 while ((c = *str++)) {
24 hash = ((hash << 5) + hash) + c; /* hash * 33 + c */
25 }
26
27 str--;
28 *str = backup;
29 return hash % HASH_SIZE;
30 }
31
/*
 * State of the field-name -> column lookup that is used while parsing.
 * It is filled in by hash_create() and updated by hash_update().
 */
struct hash_table {
  SEXP nms;        /* character vector receiving the field names */
  SEXP *nmsptr;    /* STRING_PTR(nms), used for name comparisons */
  SEXP cols;       /* list receiving one character vector per field */
  int *tab;        /* slot array; 0 means empty, otherwise field index + 1 */
  int tablen;      /* number of slots in `tab` */
  int nfld;        /* number of fields registered so far */
  int max_cols;    /* maximum number of fields that may be registered */
  int npkgs;       /* length of every column that gets allocated */
};
42
/*
 * Initialize `table` so that it uses the caller-allocated R objects
 * `nms` (field names), `cols` (columns) and `tab` (integer slot array).
 * No field is registered yet, and every slot is cleared to zero, which
 * marks it as empty.
 *
 * @param table    Table to initialize.
 * @param nms      Character vector that will hold the field names.
 * @param cols     List that will hold one column per field.
 * @param tab      Integer vector used as the slot array.
 * @param max_cols Maximum number of fields allowed.
 * @param npkgs    Length of the columns allocated later.
 */
static void hash_create(struct hash_table *table, SEXP nms, SEXP cols,
                        SEXP tab, int max_cols, int npkgs) {
  int *slots = INTEGER(tab);
  int nslots = LENGTH(tab);

  *table = (struct hash_table) {
    .nms      = nms,
    .nmsptr   = STRING_PTR(nms),
    .cols     = cols,
    .tab      = slots,
    .tablen   = nslots,
    .nfld     = 0,
    .max_cols = max_cols,
    .npkgs    = npkgs
  };

  /* All slots start out empty */
  memset(slots, 0, sizeof(int) * nslots);
}
55
/*
 * Store `val` as the value of field `key` (`keylen` bytes, not
 * NUL-terminated) for package number `npkg`.
 *
 * The search starts at slot hash_string(key) * MAX_COLL and walks forward
 * until it finds either the field or an empty slot. A slot holds zero when
 * it is empty, otherwise the field index plus one. If the field is not
 * known yet, it is registered: its name is appended to `table->nms` and a
 * new column of `table->npkgs` strings is allocated for it.
 *
 * A stored name only matches if it has exactly `keylen` bytes and these
 * bytes are equal to the key. Comparing just the first `keylen` bytes
 * would let a key match a longer stored name that it is a prefix of.
 *
 * @param table  The lookup table.
 * @param key    Start of the field name.
 * @param keylen Length of the field name in bytes.
 * @param npkg   Index of the package (row) to update.
 * @param val    CHARSXP to store, the caller must keep it protected.
 * @param err    If non-zero, failures throw an R error instead of
 *               returning an error code.
 * @return 0 on success, ERROR_TOO_MANY_COLUMNS or ERROR_HASH_TABLE_FULL
 *         on failure (only reachable when `err` is zero).
 */
static inline int hash_update(struct hash_table *table, char *key, int keylen,
                              int npkg, SEXP val, int err) {
  int len = table->tablen;
  int *t = table->tab;
  int hash = hash_string(key, keylen);
  int start = hash * MAX_COLL;

  for (; start < len; start++) {
    int p = t[start];
    if (p == 0) {

      /* Empty slot: the field is new, register it, if there is room */
      if (table->nfld == table->max_cols) {
        if (err) {                                                   // __NO_COVERAGE__
          R_THROW_ERROR(                                             // __NO_COVERAGE__
            "Internal pkgcache error, too many different fields in " // __NO_COVERAGE__
            "PACKAGES or DESCRIPTION data, please report a bug"      // __NO_COVERAGE__
          );                                                         // __NO_COVERAGE__
        }                                                            // __NO_COVERAGE__
        return ERROR_TOO_MANY_COLUMNS;                               // __NO_COVERAGE__
      }

      SET_STRING_ELT(table->nms, table->nfld, Rf_mkCharLenCE(key, keylen, CE_NATIVE));
      SET_VECTOR_ELT(table->cols, table->nfld, allocVector(STRSXP, table->npkgs));
      SET_STRING_ELT(VECTOR_ELT(table->cols, table->nfld), npkg, val);
      table->nfld += 1;
      t[start] = table->nfld;
      return 0;

    } else {
      /* Occupied slot: slots store the field index plus one */
      p--;
      SEXP name = table->nmsptr[p];
      if (LENGTH(name) == keylen &&
          !memcmp(key, CHAR(name), (size_t) keylen)) {
        SET_STRING_ELT(VECTOR_ELT(table->cols, p), npkg, val);
        return 0;
      }
    }
  }                                                                  // __NO_COVERAGE__

  if (err) {                                                         // __NO_COVERAGE__
    R_THROW_ERROR(                                                   // __NO_COVERAGE__
      "Internal pkgcache error, hash table is full, please report a bug" // __NO_COVERAGE__
    );                                                               // __NO_COVERAGE__
  }                                                                  // __NO_COVERAGE__
  return ERROR_HASH_TABLE_FULL;                                      // __NO_COVERAGE__
}
100
101 /* --------------------------------------------------------------------- */
102
pkgcache__read_file_raw(const char * cpath)103 SEXP pkgcache__read_file_raw(const char *cpath) {
104 SEXP result = R_NilValue;
105 int err;
106 int fd = open_file(cpath, O_RDONLY);
107
108 if (fd == -1) {
109 return(R_FORMAT_SYSTEM_ERROR("Cannot open file `%s`", cpath));
110 }
111
112 off_t len = lseek(fd, 0, SEEK_END);
113 if (len == -1) {
114 err = errno; // __NO_COVERAGE__
115 close(fd); // __NO_COVERAGE__
116 return R_FORMAT_SYSTEM_ERROR_CODE(err, "Cannot seek `%s`", cpath); // __NO_COVERAGE__
117 }
118 off_t len2 = lseek(fd, 0, SEEK_SET);
119 if (len2 == -1) {
120 err = errno; // __NO_COVERAGE__
121 close(fd); // __NO_COVERAGE__
122 return R_FORMAT_SYSTEM_ERROR_CODE(err, "Cannot seek `%s`", cpath); // __NO_COVERAGE__
123 }
124
125 /* TODO: should use cleancall to close the file if allocVector fails */
126
127 result = PROTECT(allocVector(RAWSXP, len));
128
129 ssize_t ret = read(fd, RAW(result), len);
130 if (ret == -1) {
131 err = errno; // __NO_COVERAGE__
132 close(fd); // __NO_COVERAGE__
133 UNPROTECT(1); // __NO_COVERAGE__
134 return R_FORMAT_SYSTEM_ERROR_CODE(err, "Cannot read `%s`", cpath); // __NO_COVERAGE__
135 }
136
137 close(fd);
138
139 UNPROTECT(1);
140 return result;
141 }
142
143 /* --------------------------------------------------------------------- */
144
/*
 * Read several files into raw vectors.
 *
 * @param paths Character vector of file paths.
 * @return A list with one element per path, in the same order. Each
 *         element is whatever pkgcache__read_file_raw() returned for
 *         that path.
 */
SEXP pkgcache_read_raw(SEXP paths) {
  R_xlen_t npaths = XLENGTH(paths);
  SEXP contents = PROTECT(allocVector(VECSXP, npaths));
  R_xlen_t idx = 0;

  while (idx < npaths) {
    const char *cpath = CHAR(STRING_ELT(paths, idx));
    /* Stored into the protected list right away, before any allocation */
    SEXP item = pkgcache__read_file_raw(cpath);
    SET_VECTOR_ELT(contents, idx, item);
    idx++;
  }

  UNPROTECT(1);
  return contents;
}
160
161 /* --------------------------------------------------------------------- */
162
163 #define S_BG 0 /* beginning of the file */
164 #define S_KW 1 /* inside a keyword */
165 #define S_VL 2 /* inside a value */
166 #define S_NL 3 /* right after a newline */
167 #define S_WS 4 /* after newline + space */
168
/*
 * Parse the contents of a single DESCRIPTION file.
 *
 * The parser is a small state machine over the bytes of `raw`. A field
 * is `key: value`, the value may continue on following lines that start
 * with a space or a tab. The buffer is not modified.
 *
 * The result vectors are sized from the input: a field is completed
 * either at a newline or at the end of the data, so the number of
 * newlines plus one is an upper bound for the number of fields. This
 * way files with many fields cannot run past the end of the vectors.
 *
 * @param raw Raw vector with the contents of the file.
 * @return Named character vector. The values are marked as CE_BYTES,
 *         the names are created with CE_NATIVE.
 *         Throws an R error for malformed input.
 */
SEXP pkgcache_parse_description_raw(SEXP raw) {
  char *p = NULL, *start = (char*) RAW(raw);
  char *end = start + XLENGTH(raw);
  int state = S_BG;
  char *kw = NULL, *vl = NULL;
  int kwsize = 0, vlsize = 0;
  int linum = 1;

  /* Upper bound for the number of fields: newlines + 1 */
  R_xlen_t maxfld = 1;
  for (p = start; p < end; p++) {
    if (*p == '\n') maxfld++;
  }

  SEXP result = PROTECT(allocVector(STRSXP, maxfld));
  SEXP names = PROTECT(allocVector(STRSXP, maxfld));
  R_xlen_t ridx = 0;

  for (p = start; p < end; ) {
    switch (state) {

    /* -- at the beginning --------------------------------------------- */
    case S_BG:
      if (*p == ':' || *p == '\r' || *p == '\n' || *p == ' ' || *p == '\t') {
        R_THROW_ERROR(
          "Invalid DESCRIPTION file, must start with an "
          "alphanumeric character"
        );
      }
      /* Otherwise it must be the start of a keyword */
      kw = p++;
      state = S_KW;

      break;

    /* -- within a keyword --------------------------------------------- */
    case S_KW:
      /* Maybe the keyword ends here, and a value starts */
      if (*p == ':') {
        kwsize = p - kw;
        p++;
        vl = p;
        /* Skip one leading space, but do not look past the buffer if
           the colon was the very last byte */
        if (vl < end && *vl == ' ') vl++;
        state = S_VL;

      /* A newline within a keyword is an error */
      } else if (*p == '\n') {
        R_THROW_ERROR(
          "Line %d invalid in DESCRIPTION: must be of form `key: value`",
          linum
        );

      /* Otherwise we are inside the keyword */
      } else {
        p++;
      }

      break;

    /* --- within a value ---------------------------------------------- */
    case S_VL:
      /* newline might be the end of the value, if no continuation. */
      if (*p == '\n') {
        state = S_NL;
        vlsize = p - vl;
        p++;
        linum++;

      } else {
        p++;
      }
      break;

    /* -- right after a newline ---------------------------------------- */
    case S_NL:
      /* maybe a continuation line */
      if (*p == ' ' || *p == '\t') {
        state = S_WS;
        p++;

      /* otherwise we can save the field, and start parsing the next one */
      } else {
        SET_STRING_ELT(result, ridx, Rf_mkCharLenCE(vl, vlsize, CE_BYTES));
        SET_STRING_ELT(names, ridx, Rf_mkCharLenCE(kw, kwsize, CE_NATIVE));
        ridx++;
        kw = p;
        state = S_KW;
        p++;
      }

      break;

    /* -- after continuation space ------------------------------------- */
    case S_WS:
      /* more whitespace? */
      if (*p == ' ' || *p == '\t') {
        p++;

      /* otherwise continuation line, so this is still the value */
      } else {
        state = S_VL;
        p++;
      }
      break;

    /* ----------------------------------------------------------------- */
    default:
      R_THROW_ERROR("Internal DESCRIPTION parser error");  // __NO_COVERAGE__
      break;                                               // __NO_COVERAGE__
    }
  }

  if (state == S_KW) {
    R_THROW_ERROR("DESCRIPTION file ended while parsing a key");
  } else if (state != S_BG) {
    /* Strip the trailing newline(s) */
    while (p - 1 > start && *(p-1) == '\n') p--;
    vlsize = p - vl;
    SET_STRING_ELT(result, ridx, Rf_mkCharLenCE(vl, vlsize, CE_BYTES));
    SET_STRING_ELT(names, ridx, Rf_mkCharLenCE(kw, kwsize, CE_NATIVE));
    ridx++;
  }

  /* Shortening the vector shortens its names as well */
  Rf_setAttrib(result, R_NamesSymbol, names);
  SEXP final = PROTECT(Rf_xlengthgets(result, ridx));

  UNPROTECT(3);
  return final;
}
292
293 /* --------------------------------------------------------------------- */
294
/*
 * Read and parse a single DESCRIPTION file.
 *
 * @param path Character vector, its first element is the path of the file.
 * @return The result of pkgcache_parse_description_raw() for the contents
 *         of the file. Throws an R error if the file cannot be read.
 */
SEXP pkgcache_parse_description(SEXP path) {
  SEXP raw = PROTECT(pkgcache__read_file_raw(CHAR(STRING_ELT(path, 0))));
  if (TYPEOF(raw) != RAWSXP) {
    /* The message contains the file path, so it must not be used as the
       format string itself: a `%` in the path would be interpreted as a
       conversion specification. */
    R_THROW_ERROR("%s", CHAR(STRING_ELT(raw, 0)));
  }

  SEXP desc = PROTECT(pkgcache_parse_description_raw(raw));

  UNPROTECT(2);
  return desc;
}
306
307 /* --------------------------------------------------------------------- */
308
/*
 * Parse the contents of a PACKAGES file, i.e. a sequence of `key: value`
 * records, separated by empty lines.
 *
 * To be able to use NUL-terminated scanning, the last byte of `raw` is
 * temporarily replaced by a NUL byte. The original byte is put back before
 * the function returns, including the early return for input that only
 * contains line breaks, and before a parse error is thrown. (It is not
 * restored if hash_update() throws one of its internal errors.)
 *
 * @param raw Raw vector with the contents of the file.
 * @return R_NilValue if the input is empty or contains line breaks only.
 *         Otherwise a named list of character vectors, one per field
 *         name, with one element per (estimated) package. Values are
 *         marked as CE_BYTES. Throws an R error for malformed input.
 */
SEXP pkgcache_parse_packages_raw(SEXP raw) {
  int len = LENGTH(raw);
  char *p = NULL;
  int npkgs = 1;

  if (len == 0) return R_NilValue;

  /* ------------------------------------------------------------------- */
  /* Count number of empty lines, to guess the number of packages */
  char *buf = (char*) RAW(raw);
  char tail = buf[len - 1];
  buf[len - 1] = '\0';

  /* Skip whitespace first, check for empty file */

  p = buf;
  while (*p == '\n' || *p == '\r') p++;
  if (*p == '\0') {
    /* Do not leave the caller's data modified */
    buf[len - 1] = tail;
    return R_NilValue;
  }

  /* This is faster than manual search, because strchr is optimized.
     It is also faster than strstr, for this special case of a two
     character pattern. */

  for (;;) {
    p = strchr(p, '\n');
    if (p == NULL) break;
    p++;
    if (*p == '\n' || *p == '\r') {
      p++;
      npkgs++;
      while (*p == '\n' || *p == '\r') p++;
      /* Line breaks at the very end do not start another package */
      if (*p == '\0') npkgs--;
    }
  }

  /* ------------------------------------------------------------------- */

  int state = S_BG;
  char *kw = NULL, *vl = NULL;
  int kwsize = 0, vlsize = 0;
  int linum = 1;
  int max_cols = 1000;

  SEXP nms = PROTECT(allocVector(STRSXP, max_cols));
  SEXP cols = PROTECT(allocVector(VECSXP, max_cols));
  SEXP tab = PROTECT(allocVector(INTSXP, HASH_SIZE * MAX_COLL));
  struct hash_table table;
  hash_create(&table, nms, cols, tab, max_cols, npkgs);
  int npkg = 0;

  p = buf;
  while (*p != '\0') {
    switch (state) {

    /* -- at the beginning of a package -------------------------------- */
    case S_BG:
      if (*p == '\r') {
        p++;
      } else if (*p == '\n') {
        linum++;
        p++;
      } else if (*p == ':' || *p == ' ' || *p == '\t') {
        buf[len - 1] = tail;
        R_THROW_ERROR(
          "Invalid PACKAGES file in line %d: expected key",
          linum
        );
      } else {
        kw = p++;
        state = S_KW;
      }
      break;

    /* -- within a keyword --------------------------------------------- */
    case S_KW:
      if (*p == ':') {
        kwsize = p - kw;
        p++;
        vl = p;
        if (*vl == ' ') vl++;     /* skip leading space */
        state = S_VL;

      } else if (*p == '\n') {
        buf[len - 1] = tail;
        R_THROW_ERROR(
          "Invalid line %d in PACKAGES file: must contain `:`",
          linum
        );

      } else {
        p++;
      }

      break;

    /* --- within a value ---------------------------------------------- */
    case S_VL:
      /* newline might be the end of the value, if no continuation. */
      if (*p == '\n') {
        state = S_NL;
        vlsize = p - vl;
        p++;
        linum++;

      } else {
        p++;
      }
      break;

    /* -- right after a newline ---------------------------------------- */
    case S_NL:

      /* maybe a continuation line */
      if (*p == ' ' || *p == '\t') {
        state = S_WS;
        p++;

      /* end of field */
      } else {
        /* Save field */
        SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
        hash_update(&table, kw, kwsize, npkg, val, /* err */ 1);
        UNPROTECT(1);

        /* end of package? */
        if (*p == '\n') {
          p++;
          npkg++;
          linum++;
          state = S_BG;

        } else if (*p == '\r' && *(p+1) == '\n') {
          p++;
          p++;
          npkg++;
          linum++;
          state = S_BG;

        /* or just a new key */
        } else {
          kw = p;
          p++;
          state = S_KW;
        }
      }

      break;

    /* -- after continuation space ------------------------------------- */
    case S_WS:
      /* more whitespace? */
      if (*p == ' ' || *p == '\t') {
        p++;

      /* otherwise continuation line, so this is still the value */
      } else {
        state = S_VL;
        p++;
      }

      break;

    /* ----------------------------------------------------------------- */
    default:
      buf[len - 1] = tail;                              // __NO_COVERAGE__
      R_THROW_ERROR("Internal PACKAGES parser error");  // __NO_COVERAGE__
      break;                                            // __NO_COVERAGE__
    }
  }

  /* `vl` is only NULL if no value was started at all, and then the size
     is not used below. */
  if (vl != NULL) vlsize = p - vl;
  buf[len - 1] = tail;
  /* The value ran up to the last byte, that was hidden by the NUL */
  if (state == S_VL && tail != '\n') vlsize++;

  if (state == S_KW) {
    R_THROW_ERROR("PACKAGES file ended while parsing a key");
  } else if (state != S_BG) {
    /* Save field */
    SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
    hash_update(&table, kw, kwsize, npkg, val, /* err= */ 1);
    UNPROTECT(1);
  }

  /* ------------------------------------------------------------------- */

  Rf_setAttrib(cols, R_NamesSymbol, nms);
  SEXP final = PROTECT(Rf_lengthgets(cols, table.nfld));
  UNPROTECT(4);
  return final;
}
496
497 /* --------------------------------------------------------------------- */
498
/*
 * Read and parse several DESCRIPTION files into a common set of columns.
 *
 * Each file is one row. Problems with a file (it cannot be read, it is
 * empty, it is malformed) do not throw, they are recorded in the
 * `errors` vector at the position of the file, and parsing goes on with
 * the next file. Fields that were stored before the problem was found
 * are kept.
 *
 * While a file is parsed, the last byte of its buffer is temporarily
 * replaced by a NUL byte, to allow NUL-terminated scanning. If
 * `lowercase` is true, then the field names are converted to lower case,
 * in place, in the buffer.
 *
 * @param paths     Character vector of file paths.
 * @param lowercase Logical scalar, whether to lowercase the field names.
 * @return A list of three elements: (1) a named list of character
 *         vectors, one per field name, with one element per file,
 *         (2) a character vector of error messages, one per file,
 *         (3) a logical scalar, whether there was any error.
 */
SEXP pkgcache_parse_descriptions(SEXP paths, SEXP lowercase) {
  int npkg, npkgs = LENGTH(paths);
  int clowercase = LOGICAL(lowercase)[0];

  int state = S_BG;
  char *kw = NULL, *vl = NULL;
  int kwsize = 0, vlsize = 0;
  int linum = 1;
  int haserrors = 0;

  int max_cols = 1000;

  SEXP errors = PROTECT(allocVector(STRSXP, npkgs));
  SEXP nms = PROTECT(allocVector(STRSXP, max_cols));
  SEXP cols = PROTECT(allocVector(VECSXP, max_cols));
  SEXP tab = PROTECT(allocVector(INTSXP, HASH_SIZE * MAX_COLL));
  struct hash_table table;
  hash_create(&table, nms, cols, tab, max_cols, npkgs);

  for (npkg = 0; npkg < npkgs; npkg++) {

    const char *cpath = CHAR(STRING_ELT(paths, npkg));
    SEXP raw = PROTECT(pkgcache__read_file_raw(cpath));
    if (TYPEOF(raw) != RAWSXP) {
      SET_STRING_ELT(errors, npkg, STR1(raw));
      UNPROTECT(1);
      goto failedpkg;
    }

    state = S_BG;
    kw = NULL;
    vl = NULL;
    kwsize = 0;
    vlsize = 0;
    linum = 1;

    int len = LENGTH(raw);

    /* An empty file has no last byte to replace, and nothing to parse */
    if (len == 0) {
      SET_STRING_ELT(
        errors,
        npkg,
        STR1(R_FORMAT_ERROR(
          "`%s` is empty",
          cpath
        ))
      );
      UNPROTECT(1);
      goto failedpkg;
    }

    char *buf = (char*) RAW(raw);
    char *p = buf;
    char tail = buf[len - 1];
    buf[len - 1] = '\0';

    while (*p != '\0') {
      switch(state) {
      /* -- at the beginning ------------------------------------------- */
      case S_BG:
        if (*p == ':' || *p == '\r' || *p == '\n' || *p == ' ' || *p == '\t') {
          SET_STRING_ELT(
            errors,
            npkg,
            STR1(R_FORMAT_ERROR(
              "`%s` is invalid, must start with an alphanumeric character",
              cpath
            ))
          );
          UNPROTECT(1);
          goto failedpkg;
        }
        /* Otherwise it must be the start of a keyword */
        /* tolower() needs a value that fits into unsigned char */
        if (clowercase) *p = (char) tolower((unsigned char) *p);
        kw = p++;
        state = S_KW;

        break;

      /* -- within a keyword ------------------------------------------- */
      case S_KW:
        /* Maybe the keyword ends here, and a value starts */
        if (*p == ':') {
          kwsize = p - kw;
          p++;
          vl = p;
          if (*vl == ' ') vl++;
          state = S_VL;

        /* A newline within a keyword is an error */
        } else if (*p == '\n') {
          SET_STRING_ELT(
            errors,
            npkg,
            STR1(R_FORMAT_ERROR(
              "Line %d is invalid in `%s`: must contain `:`",
              linum,
              cpath
            ))
          );
          UNPROTECT(1);
          goto failedpkg;

        /* Otherwise we are inside the keyword */
        } else {
          if (clowercase) *p = (char) tolower((unsigned char) *p);
          p++;
        }

        break;

      /* --- within a value -------------------------------------------- */
      case S_VL:
        if (*p == '\n') {
          state = S_NL;
          vlsize = p - vl;
          p++;
          linum++;

        } else {
          p++;
        }

        break;

      /* -- right after a newline -------------------------------------- */
      case S_NL:
        /* maybe a continuation line */
        if (*p == ' ' || *p == '\t') {
          state = S_WS;
          p++;

        /* otherwise we can save the field, and start parsing the next one */
        } else {
          SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
          hash_update(&table, kw, kwsize, npkg, val, 1);
          UNPROTECT(1);

          kw = p;
          state = S_KW;
          if (clowercase) *p = (char) tolower((unsigned char) *p);
          p++;
        }

        break;

      /* -- after continuation space ----------------------------------- */
      case S_WS:
        /* more whitespace? */
        if (*p == ' ' || *p == '\t') {
          p++;

        /* otherwise continuation line, so this is still the value */
        } else {
          state = S_VL;
          p++;
        }

        break;

      /* --------------------------------------------------------------- */
      default:
        R_THROW_ERROR("Internal DESCRIPTION parser error");  // __NO_COVERAGE__
        break;                                               // __NO_COVERAGE__
      }
    }

    char *stop = p;
    buf[len - 1] = tail;

    /* No complete key was seen. S_BG is possible here if the scan
       stopped at the very first byte, e.g. for a one-byte file. There is
       no key and no value to store in that case. */
    if (state == S_KW || state == S_BG) {
      SET_STRING_ELT(
        errors,
        npkg,
        STR1(R_FORMAT_ERROR(
          "`%s` ended while parsing a key",
          cpath
        ))
      );
      UNPROTECT(1);
      goto failedpkg;

    } else {
      vlsize = stop - vl;
      /* The value ran up to the last byte, that was hidden by the NUL */
      if (state == S_VL && tail != '\n') vlsize++;

      /* Save field */
      SEXP val = PROTECT(mkCharLenCE(vl, vlsize, CE_BYTES));
      hash_update(&table, kw, kwsize, npkg, val, /* err = */ 1);
      UNPROTECT(1);
    }

    UNPROTECT(1);
    continue;

  failedpkg:
    haserrors = 1;
  }

  Rf_setAttrib(cols, R_NamesSymbol, nms);
  SEXP final = PROTECT(allocVector(VECSXP, 3));
  SET_VECTOR_ELT(final, 0, Rf_lengthgets(cols, table.nfld));
  SET_VECTOR_ELT(final, 1, errors);
  SET_VECTOR_ELT(final, 2, ScalarLogical(haserrors));

  UNPROTECT(5);
  return final;
}
691