1 /*
2  *  R : A Computer Language for Statistical Data Analysis
3  *  Copyright (C) 2001-2018   The R Core Team.
4  *
5  *  This program is free software; you can redistribute it and/or modify
6  *  it under the terms of the GNU General Public License as published by
7  *  the Free Software Foundation; either version 2 of the License, or
8  *  (at your option) any later version.
9  *
10  *  This program is distributed in the hope that it will be useful,
11  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
12  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
13  *  GNU General Public License for more details.
14  *
15  *  You should have received a copy of the GNU General Public License
16  *  along with this program; if not, a copy is available at
17  *  https://www.R-project.org/Licenses/
18  */
19 
20 #ifdef HAVE_CONFIG_H
21 # include <config.h>
22 #endif
23 
24 #define R_USE_SIGNALS 1
25 #include <Defn.h>
26 #include <Internal.h>
27 #include <Rconnections.h>
28 
29 #include <tre/tre.h>
30 
31 static SEXP allocMatrixNA(SEXPTYPE, int, int);
32 static void transferVector(SEXP s, SEXP t);
33 
con_cleanup(void * data)34 static void con_cleanup(void *data)
35 {
36     Rconnection con = data;
37     if(con->isopen) con->close(con);
38 }
39 
40 static Rboolean field_is_foldable_p(const char *, SEXP);
41 
42 /* Use R_alloc as this might get interrupted */
Rconn_getline2(Rconnection con,char * buf,int bufsize)43 static char *Rconn_getline2(Rconnection con, char *buf, int bufsize)
44 {
45     int c, nbuf = 0;
46     while((c = Rconn_fgetc(con)) != R_EOF) {
47 	if(nbuf+1 >= bufsize) { // allow for terminator below
48 	    bufsize *= 2;
49 	    char *buf2 = R_alloc(bufsize, sizeof(char));
50 	    memcpy(buf2, buf, nbuf);
51 	    buf = buf2;
52 	}
53 	if(c != '\n'){
54 	    buf[nbuf++] = (char) c;
55 	} else {
56 	    buf[nbuf++] = '\0';
57 	    break;
58 	}
59     }
60     if (!nbuf)
61     	return NULL;
62     /* Make sure it is null-terminated even if file did not end with
63      *  newline.
64      */
65     if(buf[nbuf-1]) buf[nbuf] = '\0';
66     return buf;
67 }
68 
do_readDCF(SEXP call,SEXP op,SEXP args,SEXP env)69 SEXP attribute_hidden do_readDCF(SEXP call, SEXP op, SEXP args, SEXP env)
70 {
71     int nwhat, nret, nc, nr, m, k, lastm, need, i, n_eblanklines = 0;
72     Rboolean blank_skip, field_skip = FALSE;
73     int whatlen, dynwhat, buflen = 8096; // was 100, but that re-alloced often
74     char *line, *buf;
75     regex_t blankline, contline, trailblank, regline, eblankline;
76     regmatch_t regmatch[1];
77     SEXP file, what, what2, retval, retval2, dims, dimnames;
78     Rconnection con = NULL;
79     Rboolean wasopen, is_eblankline;
80     RCNTXT cntxt;
81 
82     SEXP fold_excludes;
83     Rboolean field_fold = TRUE, has_fold_excludes;
84     const char *field_name;
85     int offset = 0; /* -Wall */
86 
87     checkArity(op, args);
88 
89     file = CAR(args);
90     con = getConnection(asInteger(file));
91     wasopen = con->isopen;
92     if(!wasopen) {
93 	if(!con->open(con)) error(_("cannot open the connection"));
94 	/* Set up a context which will close the connection on error */
95 	begincontext(&cntxt, CTXT_CCODE, R_NilValue, R_BaseEnv, R_BaseEnv,
96 		     R_NilValue, R_NilValue);
97 	cntxt.cend = &con_cleanup;
98 	cntxt.cenddata = con;
99     }
100     if(!con->canread) error(_("cannot read from this connection"));
101 
102     args = CDR(args);
103     PROTECT(what = coerceVector(CAR(args), STRSXP)); /* argument fields */
104     nwhat = LENGTH(what);
105     dynwhat = (nwhat == 0);
106 
107     args = CDR(args);
108     PROTECT(fold_excludes = coerceVector(CAR(args), STRSXP));
109     has_fold_excludes = (LENGTH(fold_excludes) > 0);
110 
111     buf = (char *) malloc(buflen);
112     if(!buf) error(_("could not allocate memory for 'read.dcf'"));
113     nret = 20;
114     /* it is easier if we first have a record per column */
115     PROTECT(retval = allocMatrixNA(STRSXP, LENGTH(what), nret));
116 
117     /* These used to use [:blank:] but that can match \xa0 as part of
118        a UTF-8 character (and is nbspace on Windows). */
119     tre_regcomp(&blankline, "^[[:blank:]]*$", REG_NOSUB & REG_EXTENDED);
120     tre_regcomp(&trailblank, "[ \t]+$", REG_EXTENDED);
121     tre_regcomp(&contline, "^[[:blank:]]+", REG_EXTENDED);
122     tre_regcomp(&regline, "^[^:]+:[[:blank:]]*", REG_EXTENDED);
123     tre_regcomp(&eblankline, "^[[:space:]]+\\.[[:space:]]*$", REG_EXTENDED);
124 
125     k = 0;
126     lastm = -1; /* index of the field currently being recorded */
127     blank_skip = TRUE;
128     void *vmax = vmaxget();
129     char buf0[MAXELTSIZE];
130     while((line = Rconn_getline2(con, buf0, MAXELTSIZE))) {
131 	if(strlen(line) == 0 ||
132 	   tre_regexecb(&blankline, line, 0, 0, 0) == 0) {
133 	    /* A blank line.  The first one after a record ends a new
134 	     * record, subsequent ones are skipped */
135 	    if(!blank_skip) {
136 		k++;
137 		if(k > nret - 1){
138 		    nret *= 2;
139 		    PROTECT(retval2 = allocMatrixNA(STRSXP, LENGTH(what), nret));
140 		    transferVector(retval2, retval);
141 		    retval = retval2;
142 		    UNPROTECT(2); /* retval, retval2 */
143 		    PROTECT(retval);
144 		}
145 		blank_skip = TRUE;
146 		lastm = -1;
147 		field_skip = FALSE;
148 		field_fold = TRUE;
149 		n_eblanklines = 0;
150 	    }
151 	} else {
152 	    blank_skip = FALSE;
153 	    if(tre_regexecb(&contline, line, 1, regmatch, 0) == 0) {
154 		/* A continuation line: wrong if at the beginning of a
155 		   record. */
156 		if((lastm == -1) && !field_skip) {
157 		    line[20] = '\0';
158 		    error(_("Found continuation line starting '%s ...' at begin of record."),
159 			  line);
160 		}
161 		if(lastm >= 0) {
162 		    need = (int) strlen(CHAR(STRING_ELT(retval,
163 							lastm + nwhat * k))) + 2;
164 		    if(tre_regexecb(&eblankline, line, 0, NULL, 0) == 0) {
165 			is_eblankline = TRUE;
166 			if(field_fold) {
167 			    n_eblanklines++;
168 			    continue;
169 			}
170 		    } else {
171 			is_eblankline = FALSE;
172 			if(field_fold) {
173 			    offset = regmatch[0].rm_eo;
174 			    /* Also remove trailing whitespace. */
175 			    if((tre_regexecb(&trailblank, line, 1,
176 					     regmatch, 0) == 0))
177 				line[regmatch[0].rm_so] = '\0';
178 			} else {
179 			    offset = 0;
180 			}
181 			need += (int) strlen(line + offset) + n_eblanklines;
182 		    }
183 		    if(buflen < need) {
184 			char *tmp = (char *) realloc(buf, need);
185 			if(!tmp) {
186 			    free(buf);
187 			    error(_("could not allocate memory for 'read.dcf'"));
188 			} else buf = tmp;
189 			buflen = need;
190 		    }
191 		    strcpy(buf, CHAR(STRING_ELT(retval, lastm + nwhat * k)));
192 		    if(strlen(buf) || !field_fold)
193 			strcat(buf, "\n");
194 		    if(!is_eblankline) {
195 			if(n_eblanklines > 0) {
196 			    for(i = 0; i < n_eblanklines; i++) {
197 				strcat(buf, "\n");
198 			    }
199 			    n_eblanklines = 0;
200 			}
201 			strcat(buf, line + offset);
202 		    }
203 		    SET_STRING_ELT(retval, lastm + nwhat * k, mkChar(buf));
204 		}
205 	    } else {
206 		if(tre_regexecb(&regline, line, 1, regmatch, 0) == 0) {
207 		    for(m = 0; m < nwhat; m++){
208 			whatlen = (int) strlen(CHAR(STRING_ELT(what, m)));
209 			if(strlen(line) > whatlen &&
210 			   line[whatlen] == ':' &&
211 			   strncmp(CHAR(STRING_ELT(what, m)),
212 				   line, whatlen) == 0) {
213 			    /* An already known field we are recording. */
214 			    lastm = m;
215 			    field_skip = FALSE;
216 			    field_name = CHAR(STRING_ELT(what, lastm));
217 			    if(has_fold_excludes) {
218 				field_fold =
219 				    field_is_foldable_p(field_name,
220 							fold_excludes);
221 			    }
222 			    offset = regmatch[0].rm_eo;
223 			    if(field_fold) {
224 				/* Also remove trailing whitespace. */
225 				if((tre_regexecb(&trailblank, line, 1,
226 						 regmatch, 0) == 0))
227 				    line[regmatch[0].rm_so] = '\0';
228 			    }
229 			    SET_STRING_ELT(retval, m + nwhat * k,
230 					   mkChar(line + offset));
231 			    break;
232 			} else {
233 			    /* This is a field, but not one prespecified */
234 			    lastm = -1;
235 			    field_skip = TRUE;
236 			}
237 		    }
238 		    if(dynwhat && (lastm == -1)) {
239 			/* A previously unseen field and we are
240 			 * recording all fields */
241 			field_skip = FALSE;
242 			PROTECT(what2 = allocVector(STRSXP, nwhat+1));
243 			PROTECT(retval2 = allocMatrixNA(STRSXP,
244 							nrows(retval)+1,
245 							ncols(retval)));
246 			if(nwhat > 0) {
247 			    copyVector(what2, what);
248 			    for(nr = 0; nr < nrows(retval); nr++){
249 				for(nc = 0; nc < ncols(retval); nc++){
250 				    SET_STRING_ELT(retval2, nr+nc*nrows(retval2),
251 						   STRING_ELT(retval,
252 							      nr+nc*nrows(retval)));
253 				}
254 			    }
255 			}
256 			retval = retval2;
257 			what = what2;
258 			UNPROTECT(5); /* what, fold_excludes, retval, what2, retval2 */
259 			PROTECT(what);
260 			PROTECT(fold_excludes);
261 			PROTECT(retval);
262 			/* Make sure enough space was used */
263 			need = (int) (Rf_strchr(line, ':') - line + 1);
264 			if(buflen < need){
265 			    char *tmp = (char *) realloc(buf, need);
266 			    if(!tmp) {
267 				free(buf);
268 				error(_("could not allocate memory for 'read.dcf'"));
269 			    } else buf = tmp;
270 			    buflen = need;
271 			}
272 			strncpy(buf, line, Rf_strchr(line, ':') - line);
273 			buf[Rf_strchr(line, ':') - line] = '\0';
274 			SET_STRING_ELT(what, nwhat, mkChar(buf));
275 			nwhat++;
276 			/* lastm uses C indexing, hence nwhat - 1 */
277 			lastm = nwhat - 1;
278 			field_name = CHAR(STRING_ELT(what, lastm));
279 			if(has_fold_excludes) {
280 			    field_fold =
281 				field_is_foldable_p(field_name,
282 						    fold_excludes);
283 			}
284 			offset = regmatch[0].rm_eo;
285 			if(field_fold) {
286 			    /* Also remove trailing whitespace. */
287 			    if((tre_regexecb(&trailblank, line, 1,
288 					     regmatch, 0) == 0))
289 				line[regmatch[0].rm_so] = '\0';
290 			}
291 			SET_STRING_ELT(retval, lastm + nwhat * k,
292 				       mkChar(line + offset));
293 		    }
294 		} else {
295 		    /* Must be a regular line with no tag ... */
296 		    line[20] = '\0';
297 		    error(_("Line starting '%s ...' is malformed!"), line);
298 		}
299 	    }
300 	}
301     }
302     vmaxset(vmax);
303     if(!wasopen) {endcontext(&cntxt); con->close(con);}
304     free(buf);
305     tre_regfree(&blankline);
306     tre_regfree(&contline);
307     tre_regfree(&trailblank);
308     tre_regfree(&regline);
309     tre_regfree(&eblankline);
310 
311     if(!blank_skip) k++;
312 
313     /* and now transpose the whole matrix */
314     PROTECT(retval2 = allocMatrixNA(STRSXP, k, LENGTH(what)));
315     copyMatrix(retval2, retval, 1);
316 
317     PROTECT(dimnames = allocVector(VECSXP, 2));
318     PROTECT(dims = allocVector(INTSXP, 2));
319     INTEGER(dims)[0] = k;
320     INTEGER(dims)[1] = LENGTH(what);
321     SET_VECTOR_ELT(dimnames, 1, what);
322     setAttrib(retval2, R_DimSymbol, dims);
323     setAttrib(retval2, R_DimNamesSymbol, dimnames);
324     UNPROTECT(6); /* what, fold_excludes, retval, retval2, dimnames, dims */
325     return(retval2);
326 }
327 
328 
allocMatrixNA(SEXPTYPE mode,int nrow,int ncol)329 static SEXP allocMatrixNA(SEXPTYPE mode, int nrow, int ncol)
330 {
331     int k;
332     SEXP retval;
333 
334     PROTECT(retval = allocMatrix(mode, nrow, ncol));
335     for(k = 0; k < LENGTH(retval); k++)
336 	SET_STRING_ELT(retval, k, NA_STRING);
337     UNPROTECT(1);
338     return(retval);
339 }
340 
341 /* This one is needed because the normal copy operations will do
342    recycling */
343 
transferVector(SEXP s,SEXP t)344 static void transferVector(SEXP s, SEXP t)
345 {
346     for (int i = 0; i < LENGTH(t); i++)
347 	SET_STRING_ELT(s, i, STRING_ELT(t, i));
348 }
349 
field_is_foldable_p(const char * field,SEXP excludes)350 static Rboolean field_is_foldable_p(const char *field, SEXP excludes)
351 {
352     int i, n = LENGTH(excludes);
353     for(i = 0; i < n; i++) {
354 	if(strcmp(field, CHAR(STRING_ELT(excludes, i))) == 0)
355 	    return FALSE;
356     }
357     return TRUE;
358 }
359