1 /*
2 * R : A Computer Language for Statistical Data Analysis
3 * Copyright (C) 2001-2018 The R Core Team.
4 *
5 * This program is free software; you can redistribute it and/or modify
6 * it under the terms of the GNU General Public License as published by
7 * the Free Software Foundation; either version 2 of the License, or
8 * (at your option) any later version.
9 *
10 * This program is distributed in the hope that it will be useful,
11 * but WITHOUT ANY WARRANTY; without even the implied warranty of
12 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 * GNU General Public License for more details.
14 *
15 * You should have received a copy of the GNU General Public License
16 * along with this program; if not, a copy is available at
17 * https://www.R-project.org/Licenses/
18 */
19
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23
24 #define R_USE_SIGNALS 1
25 #include <Defn.h>
26 #include <Internal.h>
27 #include <Rconnections.h>
28
29 #include <tre/tre.h>
30
31 static SEXP allocMatrixNA(SEXPTYPE, int, int);
32 static void transferVector(SEXP s, SEXP t);
33
con_cleanup(void * data)34 static void con_cleanup(void *data)
35 {
36 Rconnection con = data;
37 if(con->isopen) con->close(con);
38 }
39
40 static Rboolean field_is_foldable_p(const char *, SEXP);
41
42 /* Use R_alloc as this might get interrupted */
Rconn_getline2(Rconnection con,char * buf,int bufsize)43 static char *Rconn_getline2(Rconnection con, char *buf, int bufsize)
44 {
45 int c, nbuf = 0;
46 while((c = Rconn_fgetc(con)) != R_EOF) {
47 if(nbuf+1 >= bufsize) { // allow for terminator below
48 bufsize *= 2;
49 char *buf2 = R_alloc(bufsize, sizeof(char));
50 memcpy(buf2, buf, nbuf);
51 buf = buf2;
52 }
53 if(c != '\n'){
54 buf[nbuf++] = (char) c;
55 } else {
56 buf[nbuf++] = '\0';
57 break;
58 }
59 }
60 if (!nbuf)
61 return NULL;
62 /* Make sure it is null-terminated even if file did not end with
63 * newline.
64 */
65 if(buf[nbuf-1]) buf[nbuf] = '\0';
66 return buf;
67 }
68
do_readDCF(SEXP call,SEXP op,SEXP args,SEXP env)69 SEXP attribute_hidden do_readDCF(SEXP call, SEXP op, SEXP args, SEXP env)
70 {
71 int nwhat, nret, nc, nr, m, k, lastm, need, i, n_eblanklines = 0;
72 Rboolean blank_skip, field_skip = FALSE;
73 int whatlen, dynwhat, buflen = 8096; // was 100, but that re-alloced often
74 char *line, *buf;
75 regex_t blankline, contline, trailblank, regline, eblankline;
76 regmatch_t regmatch[1];
77 SEXP file, what, what2, retval, retval2, dims, dimnames;
78 Rconnection con = NULL;
79 Rboolean wasopen, is_eblankline;
80 RCNTXT cntxt;
81
82 SEXP fold_excludes;
83 Rboolean field_fold = TRUE, has_fold_excludes;
84 const char *field_name;
85 int offset = 0; /* -Wall */
86
87 checkArity(op, args);
88
89 file = CAR(args);
90 con = getConnection(asInteger(file));
91 wasopen = con->isopen;
92 if(!wasopen) {
93 if(!con->open(con)) error(_("cannot open the connection"));
94 /* Set up a context which will close the connection on error */
95 begincontext(&cntxt, CTXT_CCODE, R_NilValue, R_BaseEnv, R_BaseEnv,
96 R_NilValue, R_NilValue);
97 cntxt.cend = &con_cleanup;
98 cntxt.cenddata = con;
99 }
100 if(!con->canread) error(_("cannot read from this connection"));
101
102 args = CDR(args);
103 PROTECT(what = coerceVector(CAR(args), STRSXP)); /* argument fields */
104 nwhat = LENGTH(what);
105 dynwhat = (nwhat == 0);
106
107 args = CDR(args);
108 PROTECT(fold_excludes = coerceVector(CAR(args), STRSXP));
109 has_fold_excludes = (LENGTH(fold_excludes) > 0);
110
111 buf = (char *) malloc(buflen);
112 if(!buf) error(_("could not allocate memory for 'read.dcf'"));
113 nret = 20;
114 /* it is easier if we first have a record per column */
115 PROTECT(retval = allocMatrixNA(STRSXP, LENGTH(what), nret));
116
117 /* These used to use [:blank:] but that can match \xa0 as part of
118 a UTF-8 character (and is nbspace on Windows). */
119 tre_regcomp(&blankline, "^[[:blank:]]*$", REG_NOSUB & REG_EXTENDED);
120 tre_regcomp(&trailblank, "[ \t]+$", REG_EXTENDED);
121 tre_regcomp(&contline, "^[[:blank:]]+", REG_EXTENDED);
122 tre_regcomp(®line, "^[^:]+:[[:blank:]]*", REG_EXTENDED);
123 tre_regcomp(&eblankline, "^[[:space:]]+\\.[[:space:]]*$", REG_EXTENDED);
124
125 k = 0;
126 lastm = -1; /* index of the field currently being recorded */
127 blank_skip = TRUE;
128 void *vmax = vmaxget();
129 char buf0[MAXELTSIZE];
130 while((line = Rconn_getline2(con, buf0, MAXELTSIZE))) {
131 if(strlen(line) == 0 ||
132 tre_regexecb(&blankline, line, 0, 0, 0) == 0) {
133 /* A blank line. The first one after a record ends a new
134 * record, subsequent ones are skipped */
135 if(!blank_skip) {
136 k++;
137 if(k > nret - 1){
138 nret *= 2;
139 PROTECT(retval2 = allocMatrixNA(STRSXP, LENGTH(what), nret));
140 transferVector(retval2, retval);
141 retval = retval2;
142 UNPROTECT(2); /* retval, retval2 */
143 PROTECT(retval);
144 }
145 blank_skip = TRUE;
146 lastm = -1;
147 field_skip = FALSE;
148 field_fold = TRUE;
149 n_eblanklines = 0;
150 }
151 } else {
152 blank_skip = FALSE;
153 if(tre_regexecb(&contline, line, 1, regmatch, 0) == 0) {
154 /* A continuation line: wrong if at the beginning of a
155 record. */
156 if((lastm == -1) && !field_skip) {
157 line[20] = '\0';
158 error(_("Found continuation line starting '%s ...' at begin of record."),
159 line);
160 }
161 if(lastm >= 0) {
162 need = (int) strlen(CHAR(STRING_ELT(retval,
163 lastm + nwhat * k))) + 2;
164 if(tre_regexecb(&eblankline, line, 0, NULL, 0) == 0) {
165 is_eblankline = TRUE;
166 if(field_fold) {
167 n_eblanklines++;
168 continue;
169 }
170 } else {
171 is_eblankline = FALSE;
172 if(field_fold) {
173 offset = regmatch[0].rm_eo;
174 /* Also remove trailing whitespace. */
175 if((tre_regexecb(&trailblank, line, 1,
176 regmatch, 0) == 0))
177 line[regmatch[0].rm_so] = '\0';
178 } else {
179 offset = 0;
180 }
181 need += (int) strlen(line + offset) + n_eblanklines;
182 }
183 if(buflen < need) {
184 char *tmp = (char *) realloc(buf, need);
185 if(!tmp) {
186 free(buf);
187 error(_("could not allocate memory for 'read.dcf'"));
188 } else buf = tmp;
189 buflen = need;
190 }
191 strcpy(buf, CHAR(STRING_ELT(retval, lastm + nwhat * k)));
192 if(strlen(buf) || !field_fold)
193 strcat(buf, "\n");
194 if(!is_eblankline) {
195 if(n_eblanklines > 0) {
196 for(i = 0; i < n_eblanklines; i++) {
197 strcat(buf, "\n");
198 }
199 n_eblanklines = 0;
200 }
201 strcat(buf, line + offset);
202 }
203 SET_STRING_ELT(retval, lastm + nwhat * k, mkChar(buf));
204 }
205 } else {
206 if(tre_regexecb(®line, line, 1, regmatch, 0) == 0) {
207 for(m = 0; m < nwhat; m++){
208 whatlen = (int) strlen(CHAR(STRING_ELT(what, m)));
209 if(strlen(line) > whatlen &&
210 line[whatlen] == ':' &&
211 strncmp(CHAR(STRING_ELT(what, m)),
212 line, whatlen) == 0) {
213 /* An already known field we are recording. */
214 lastm = m;
215 field_skip = FALSE;
216 field_name = CHAR(STRING_ELT(what, lastm));
217 if(has_fold_excludes) {
218 field_fold =
219 field_is_foldable_p(field_name,
220 fold_excludes);
221 }
222 offset = regmatch[0].rm_eo;
223 if(field_fold) {
224 /* Also remove trailing whitespace. */
225 if((tre_regexecb(&trailblank, line, 1,
226 regmatch, 0) == 0))
227 line[regmatch[0].rm_so] = '\0';
228 }
229 SET_STRING_ELT(retval, m + nwhat * k,
230 mkChar(line + offset));
231 break;
232 } else {
233 /* This is a field, but not one prespecified */
234 lastm = -1;
235 field_skip = TRUE;
236 }
237 }
238 if(dynwhat && (lastm == -1)) {
239 /* A previously unseen field and we are
240 * recording all fields */
241 field_skip = FALSE;
242 PROTECT(what2 = allocVector(STRSXP, nwhat+1));
243 PROTECT(retval2 = allocMatrixNA(STRSXP,
244 nrows(retval)+1,
245 ncols(retval)));
246 if(nwhat > 0) {
247 copyVector(what2, what);
248 for(nr = 0; nr < nrows(retval); nr++){
249 for(nc = 0; nc < ncols(retval); nc++){
250 SET_STRING_ELT(retval2, nr+nc*nrows(retval2),
251 STRING_ELT(retval,
252 nr+nc*nrows(retval)));
253 }
254 }
255 }
256 retval = retval2;
257 what = what2;
258 UNPROTECT(5); /* what, fold_excludes, retval, what2, retval2 */
259 PROTECT(what);
260 PROTECT(fold_excludes);
261 PROTECT(retval);
262 /* Make sure enough space was used */
263 need = (int) (Rf_strchr(line, ':') - line + 1);
264 if(buflen < need){
265 char *tmp = (char *) realloc(buf, need);
266 if(!tmp) {
267 free(buf);
268 error(_("could not allocate memory for 'read.dcf'"));
269 } else buf = tmp;
270 buflen = need;
271 }
272 strncpy(buf, line, Rf_strchr(line, ':') - line);
273 buf[Rf_strchr(line, ':') - line] = '\0';
274 SET_STRING_ELT(what, nwhat, mkChar(buf));
275 nwhat++;
276 /* lastm uses C indexing, hence nwhat - 1 */
277 lastm = nwhat - 1;
278 field_name = CHAR(STRING_ELT(what, lastm));
279 if(has_fold_excludes) {
280 field_fold =
281 field_is_foldable_p(field_name,
282 fold_excludes);
283 }
284 offset = regmatch[0].rm_eo;
285 if(field_fold) {
286 /* Also remove trailing whitespace. */
287 if((tre_regexecb(&trailblank, line, 1,
288 regmatch, 0) == 0))
289 line[regmatch[0].rm_so] = '\0';
290 }
291 SET_STRING_ELT(retval, lastm + nwhat * k,
292 mkChar(line + offset));
293 }
294 } else {
295 /* Must be a regular line with no tag ... */
296 line[20] = '\0';
297 error(_("Line starting '%s ...' is malformed!"), line);
298 }
299 }
300 }
301 }
302 vmaxset(vmax);
303 if(!wasopen) {endcontext(&cntxt); con->close(con);}
304 free(buf);
305 tre_regfree(&blankline);
306 tre_regfree(&contline);
307 tre_regfree(&trailblank);
308 tre_regfree(®line);
309 tre_regfree(&eblankline);
310
311 if(!blank_skip) k++;
312
313 /* and now transpose the whole matrix */
314 PROTECT(retval2 = allocMatrixNA(STRSXP, k, LENGTH(what)));
315 copyMatrix(retval2, retval, 1);
316
317 PROTECT(dimnames = allocVector(VECSXP, 2));
318 PROTECT(dims = allocVector(INTSXP, 2));
319 INTEGER(dims)[0] = k;
320 INTEGER(dims)[1] = LENGTH(what);
321 SET_VECTOR_ELT(dimnames, 1, what);
322 setAttrib(retval2, R_DimSymbol, dims);
323 setAttrib(retval2, R_DimNamesSymbol, dimnames);
324 UNPROTECT(6); /* what, fold_excludes, retval, retval2, dimnames, dims */
325 return(retval2);
326 }
327
328
allocMatrixNA(SEXPTYPE mode,int nrow,int ncol)329 static SEXP allocMatrixNA(SEXPTYPE mode, int nrow, int ncol)
330 {
331 int k;
332 SEXP retval;
333
334 PROTECT(retval = allocMatrix(mode, nrow, ncol));
335 for(k = 0; k < LENGTH(retval); k++)
336 SET_STRING_ELT(retval, k, NA_STRING);
337 UNPROTECT(1);
338 return(retval);
339 }
340
341 /* This one is needed because the normal copy operations will do
342 recycling */
343
transferVector(SEXP s,SEXP t)344 static void transferVector(SEXP s, SEXP t)
345 {
346 for (int i = 0; i < LENGTH(t); i++)
347 SET_STRING_ELT(s, i, STRING_ELT(t, i));
348 }
349
field_is_foldable_p(const char * field,SEXP excludes)350 static Rboolean field_is_foldable_p(const char *field, SEXP excludes)
351 {
352 int i, n = LENGTH(excludes);
353 for(i = 0; i < n; i++) {
354 if(strcmp(field, CHAR(STRING_ELT(excludes, i))) == 0)
355 return FALSE;
356 }
357 return TRUE;
358 }
359