1 /*
2  *  R : A Computer Language for Statistical Data Analysis
3  *  Copyright (C) 1997--2020  The R Core Team
4  *  Copyright (C) 1995, 1996  Robert Gentleman and Ross Ihaka
5  *
6  *  This program is free software; you can redistribute it and/or modify
7  *  it under the terms of the GNU General Public License as published by
8  *  the Free Software Foundation; either version 2 of the License, or
9  *  (at your option) any later version.
10  *
11  *  This program is distributed in the hope that it will be useful,
12  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
13  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
14  *  GNU General Public License for more details.
15  *
16  *  You should have received a copy of the GNU General Public License
17  *  along with this program; if not, a copy is available at
18  *  https://www.R-project.org/Licenses/
19  */
20 
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24 
25 #include <Defn.h>
26 
27 #include "statsR.h"
28 #undef _
29 #ifdef ENABLE_NLS
30 #include <libintl.h>
31 #define _(String) dgettext ("stats", String)
32 #else
33 #define _(String) (String)
34 #endif
35 
36 /* inline-able versions, used just once! */
isUnordered_int(SEXP s)37 static R_INLINE Rboolean isUnordered_int(SEXP s)
38 {
39     return (TYPEOF(s) == INTSXP
40 	    && inherits(s, "factor")
41 	    && !inherits(s, "ordered"));
42 }
43 
isOrdered_int(SEXP s)44 static R_INLINE Rboolean isOrdered_int(SEXP s)
45 {
46     return (TYPEOF(s) == INTSXP
47 	    && inherits(s, "factor")
48 	    && inherits(s, "ordered"));
49 }
50 
51 /*
52  *  model.frame
53  *
54  *  The argument "terms" contains the terms object generated from the
55  *  model formula.  We first evaluate the "variables" attribute of
56  *  "terms" in the "data" environment.  This gives us a list of basic
57  *  variables to be in the model frame.  We do some basic sanity
58  *  checks on these to ensure that resulting object make sense.
59  *
60  *  The argument "dots" gives additional things like "weights", "offsets"
61  *  and "subset" which will also go into the model frame so that they can
62  *  be treated in parallel.
63  *
64  *  Next we subset the data frame according to "subset" and finally apply
65  *  "na.action" to get the final data frame.
66  *
67  *  Note that the "terms" argument is glued to the model frame as an
68  *  attribute.  Code downstream appears to need this.
69  *
70  */
71 
72 /* model.frame(terms, rownames, variables, varnames, */
73 /*             dots, dotnames, subset, na.action) */
74 
modelframe(SEXP call,SEXP op,SEXP args,SEXP rho)75 SEXP modelframe(SEXP call, SEXP op, SEXP args, SEXP rho)
76 {
77     SEXP terms, data, names, variables, varnames, dots, dotnames, na_action;
78     SEXP ans, row_names, subset, tmp;
79     char buf[256];
80     int i, j, nr, nc;
81     int nvars, ndots, nactualdots;
82     const void *vmax = vmaxget();
83 
84     args = CDR(args);
85     terms = CAR(args); args = CDR(args);
86     row_names = CAR(args); args = CDR(args);
87     variables = CAR(args); args = CDR(args);
88     varnames = CAR(args); args = CDR(args);
89     dots = CAR(args); args = CDR(args);
90     dotnames = CAR(args); args = CDR(args);
91     subset = CAR(args); args = CDR(args);
92     na_action = CAR(args);
93 
94     /* Argument Sanity Checks */
95 
96     if (!isNewList(variables))
97 	error(_("invalid variables"));
98     if (!isString(varnames))
99 	error(_("invalid variable names"));
100     if ((nvars = length(variables)) != length(varnames))
101 	error(_("number of variables != number of variable names"));
102 
103     if (!isNewList(dots))
104 	error(_("invalid extra variables"));
105     if ((ndots = length(dots)) != length(dotnames))
106 	error(_("number of variables != number of variable names"));
107     if ( ndots && !isString(dotnames))
108 	error(_("invalid extra variable names"));
109 
110     /*  check for NULL extra arguments -- moved from interpreted code */
111 
112     nactualdots = 0;
113     for (i = 0; i < ndots; i++)
114 	if (VECTOR_ELT(dots, i) != R_NilValue) nactualdots++;
115 
116     /* Assemble the base data frame. */
117 
118     PROTECT(data = allocVector(VECSXP, nvars + nactualdots));
119     PROTECT(names = allocVector(STRSXP, nvars + nactualdots));
120 
121     for (i = 0; i < nvars; i++) {
122 	SET_VECTOR_ELT(data, i, VECTOR_ELT(variables, i));
123 	SET_STRING_ELT(names, i, STRING_ELT(varnames, i));
124     }
125     for (i = 0,j = 0; i < ndots; i++) {
126 	const char *ss;
127 	if (VECTOR_ELT(dots, i) == R_NilValue) continue;
128 	ss = translateChar(STRING_ELT(dotnames, i));
129 	if(strlen(ss) + 3 > 256) error(_("overlong names in '%s'"), ss);
130 	snprintf(buf, 256, "(%s)", ss);
131 	SET_VECTOR_ELT(data, nvars + j, VECTOR_ELT(dots, i));
132 	SET_STRING_ELT(names, nvars + j,  mkChar(buf));
133 	j++;
134     }
135     setAttrib(data, R_NamesSymbol, names);
136     UNPROTECT(2);
137 
138     /* Sanity checks to ensure that the the answer can become */
139     /* a data frame.  Be deeply suspicious here! */
140 
141     nc = length(data);
142     nr = 0;			/* -Wall */
143     if (nc > 0) {
144 	nr = nrows(VECTOR_ELT(data, 0));
145 	for (i = 0; i < nc; i++) {
146 	    ans = VECTOR_ELT(data, i);
147 	    switch(TYPEOF(ans)) {
148 	    case LGLSXP:
149 	    case INTSXP:
150 	    case REALSXP:
151 	    case CPLXSXP:
152 	    case STRSXP:
153 	    case RAWSXP:
154 		break;
155 	    default:
156 		error(_("invalid type (%s) for variable '%s'"),
157 		      type2char(TYPEOF(ans)),
158 		      translateChar(STRING_ELT(names, i)));
159 	    }
160 	    if (nrows(ans) != nr)
161 		error(_("variable lengths differ (found for '%s')"),
162 		      translateChar(STRING_ELT(names, i)));
163 	}
164     } else nr = length(row_names);
165 
166     PROTECT(data);
167     PROTECT(subset);
168 
169     /* Turn the data "list" into a "data.frame" */
170     /* so that subsetting methods will work. */
171     /* To do this we must attach "class"  and */
172     /* "row.names" attributes */
173 
174     PROTECT(tmp = mkString("data.frame"));
175     setAttrib(data, R_ClassSymbol, tmp);
176     UNPROTECT(1);
177     if (length(row_names) == nr) {
178 	setAttrib(data, R_RowNamesSymbol, row_names);
179     } else {
180 	/*
181 	PROTECT(row_names = allocVector(INTSXP, nr));
182 	for (i = 0; i < nr; i++) INTEGER(row_names)[i] = i+1; */
183 	PROTECT(row_names = allocVector(INTSXP, 2));
184 	INTEGER(row_names)[0] = NA_INTEGER;
185 	INTEGER(row_names)[1] = nr;
186 	setAttrib(data, R_RowNamesSymbol, row_names);
187 	UNPROTECT(1);
188     }
189 
190     /* Do the subsetting, if required. */
191     /* Need to save and restore 'most' attributes */
192 
193     if (subset != R_NilValue) {
194 	PROTECT(tmp=install("[.data.frame"));
195 	PROTECT(tmp=LCONS(tmp,list4(data,subset,R_MissingArg,mkFalse())));
196 	data = eval(tmp, rho);
197 	UNPROTECT(2);
198     }
199     UNPROTECT(2);
200     PROTECT(data);
201 
202     /* finally, we run na.action on the data frame */
203     /* usually, this will be na.omit */
204 
205     if (na_action != R_NilValue) {
206 	/* some na.actions need this to distinguish responses from
207 	   explanatory variables */
208 	setAttrib(data, install("terms"), terms);
209 	if (isString(na_action) && length(na_action) > 0)
210 	    na_action = installTrChar(STRING_ELT(na_action, 0));
211 	PROTECT(na_action);
212 	PROTECT(tmp = lang2(na_action, data));
213 	ans = eval(tmp, rho);
214 	if (MAYBE_REFERENCED(ans))
215 	    ans = shallow_duplicate(ans);
216 	PROTECT(ans);
217 	if (!isNewList(ans) || length(ans) != length(data))
218 	    error(_("invalid result from na.action"));
219 	/* need to transfer _all but tsp and dim_ attributes, possibly lost
220 	   by subsetting in na.action. */
221 	/* But if data is unchanged, don't mess with it (PR#16436) */
222 
223 	for ( i = length(ans) ; i-- ; )
224 	    if (VECTOR_ELT(data, i) != VECTOR_ELT(ans, i)) {
225 		if (MAYBE_REFERENCED(VECTOR_ELT(ans, i)))
226 		    SET_VECTOR_ELT(ans, i,
227 				   shallow_duplicate(VECTOR_ELT(ans, i)));
228 		copyMostAttribNoTs(VECTOR_ELT(data, i),VECTOR_ELT(ans, i));
229 	    }
230 
231 	UNPROTECT(3);
232     }
233     else ans = data;
234     UNPROTECT(1);
235     PROTECT(ans);
236 
237     /* Finally, tack on a terms attribute
238        Now done at R level.
239        setAttrib(ans, install("terms"), terms); */
240     UNPROTECT(1);
241     vmaxset(vmax);
242     return ans;
243 }
244 
245 
246 	/* The code below is related to model expansion */
247 	/* and is ultimately called by modelmatrix. */
248 
firstfactor(double * x,int nrx,int ncx,double * c,int nrc,int ncc,int * v,int adj)249 static void firstfactor(double *x, int nrx, int ncx,
250 			double *c, int nrc, int ncc, int *v, int adj)
251 {
252     double *cj, *xj;
253 
254     for (int j = 0; j < ncc; j++) {
255 	xj = &x[j * (R_xlen_t)nrx];
256 	cj = &c[j * (R_xlen_t)nrc];
257 	for (int i = 0; i < nrx; i++)
258 	    if(v[i] == NA_INTEGER) xj[i] = NA_REAL;
259 	    else xj[i] = cj[v[i]-1+adj];
260     }
261 }
262 
addfactor(double * x,int nrx,int ncx,double * c,int nrc,int ncc,int * v,int adj)263 static void addfactor(double *x, int nrx, int ncx,
264 		      double *c, int nrc, int ncc, int *v, int adj)
265 {
266     double *ck, *xj, *yj;
267 
268     for (int k = ncc - 1; k >= 0; k--) {
269 	for (int j = 0; j < ncx; j++) {
270 	    xj = &x[j * (R_xlen_t)nrx];
271 	    yj = &x[(k * (R_xlen_t)ncx + j)*nrx];
272 	    ck = &c[k * (R_xlen_t)nrc];
273 	    for (int i = 0; i < nrx; i++)
274 	    if(v[i] == NA_INTEGER) yj[i] = NA_REAL;
275 	    else yj[i] = ck[v[i]-1+adj] * xj[i];
276 	}
277     }
278 }
279 
firstvar(double * x,int nrx,int ncx,double * c,int nrc,int ncc)280 static void firstvar(double *x, int nrx, int ncx, double *c, int nrc, int ncc)
281 {
282     double *cj, *xj;
283 
284     for (int j = 0; j < ncc; j++) {
285 	xj = &x[j * (R_xlen_t)nrx];
286 	cj = &c[j * (R_xlen_t)nrc];
287 	for (int i = 0; i < nrx; i++)
288 	    xj[i] = cj[i];
289     }
290 }
291 
addvar(double * x,int nrx,int ncx,double * c,int nrc,int ncc)292 static void addvar(double *x, int nrx, int ncx, double *c, int nrc, int ncc)
293 {
294     double *ck, *xj, *yj;
295 
296     for (int k = ncc - 1; k >= 0; k--) {
297 	for (int j = 0; j < ncx; j++) {
298 	    xj = &x[j * (R_xlen_t)nrx];
299 	    yj = &x[(k * (R_xlen_t)ncx + j)*nrx];
300 	    ck = &c[k * (R_xlen_t)nrc];
301 	    for (int i = 0; i < nrx; i++)
302 		yj[i] = ck[i] * xj[i];
303 	}
304     }
305 }
306 
307 #define BUFSIZE 4096
308 
AppendString(char * buf,const char * str)309 static char *AppendString(char *buf, const char *str)
310 {
311     while (*str)
312 	*buf++ = *str++;
313     *buf = '\0';
314     return buf;
315 }
316 
AppendInteger(char * buf,int i)317 static char *AppendInteger(char *buf, int i)
318 {
319     sprintf(buf, "%d", i);
320     while(*buf) buf++;
321     return buf;
322 }
323 
ColumnNames(SEXP x)324 static SEXP ColumnNames(SEXP x)
325 {
326     SEXP dn = getAttrib(x, R_DimNamesSymbol);
327     if (dn == R_NilValue || length(dn) < 2)
328 	return R_NilValue;
329     else
330 	return VECTOR_ELT(dn, 1);
331 }
332 
333 // called from R as  .Externals2(C_modelmatrix, t, data)
modelmatrix(SEXP call,SEXP op,SEXP args,SEXP rho)334 SEXP modelmatrix(SEXP call, SEXP op, SEXP args, SEXP rho)
335 {
336     SEXP expr, factors, terms, vars, vnames, assign;
337     SEXP xnames, tnames, rnames;
338     SEXP count, contrast, contr1, contr2, nlevs, ordered, columns, x;
339     SEXP variable, var_i;
340     int fik, first, i, j, k, kk, ll, n, nc, nterms, nVar;
341     int intrcept, jstart, jnext, risponse, indx, rhs_response;
342     double dk, dnc;
343     char buf[BUFSIZE]="\0";
344     char *bufp;
345     const char *addp;
346     R_xlen_t nn;
347 
348     args = CDR(args);
349 
350     /* Get the "terms" structure and extract */
351     /* the intercept and response attributes. */
352     terms = CAR(args); // = 't' in R's calling code
353 
354     intrcept = asLogical(getAttrib(terms, install("intercept")));
355     if (intrcept == NA_INTEGER)
356 	intrcept = 0;
357 
358     risponse = asLogical(getAttrib(terms, install("response")));
359     if (risponse == NA_INTEGER)
360 	risponse = 0;
361 
362     /* Get the factor pattern matrix.  We duplicate this because */
363     /* we may want to alter it if we are in the no-intercept case. */
364     /* Note: the values of "nVar" and "nterms" are the REAL number of */
365     /* variables in the model data frame and the number of model terms. */
366 
367     nVar = nterms = 0;		/* -Wall */
368     PROTECT(factors = duplicate(getAttrib(terms, install("factors"))));
369     if (length(factors) == 0) {
370 	/* if (intrcept == 0)
371 	   error("invalid model (zero parameters).");*/
372 	nVar = 1;
373 	nterms = 0;
374     }
375     else if (isInteger(factors) && isMatrix(factors)) {
376 	nVar = nrows(factors);
377 	nterms = ncols(factors);
378     }
379     else error(_("invalid '%s' argument"), "terms");
380 
381     /* Get the variable names from the factor matrix */
382 
383     vnames = getAttrib(factors, R_DimNamesSymbol);
384     if (length(factors) > 0) {
385 	if (length(vnames) < 1 ||
386 	    (nVar - intrcept > 0 && !isString(VECTOR_ELT(vnames, 0))))
387 	    error(_("invalid '%s' argument"), "terms");
388 	vnames = VECTOR_ELT(vnames, 0);
389     }
390 
391     /* Get the variables from the model frame.  First perform */
392     /* elementary sanity checks.  Notes:  1) We need at least */
393     /* one variable (lhs or rhs) to compute the number of cases. */
394     /* 2) We don't type-check the response. */
395 
396     vars = CADR(args);
397     if (!isNewList(vars) || length(vars) < nVar)
398 	error(_("invalid model frame"));
399     if (length(vars) == 0)
400 	error(_("do not know how many cases"));
401 
402     nn = n = nrows(VECTOR_ELT(vars, 0));
403     /* This could be generated, so need to protect it */
404     PROTECT(rnames = getAttrib(vars, R_RowNamesSymbol));
405 
406     /* This section of the code checks the types of the variables
407        in the model frame.  Note that it should really only check
408        the variables if they appear in a term in the model.
409        Because it does not, we need to allow other types here, as they
410        might well occur on the LHS.
411        The R code converts all character variables in the model frame to
412        factors, so the only types that ought to be here are logical,
413        integer (including factor), numeric and complex.
414      */
415 
416     PROTECT(variable = allocVector(VECSXP, nVar));
417     PROTECT(nlevs = allocVector(INTSXP, nVar));
418     PROTECT(ordered = allocVector(LGLSXP, nVar));
419     PROTECT(columns = allocVector(INTSXP, nVar));
420 
421     for (i = 0; i < nVar; i++) {
422 	var_i = SET_VECTOR_ELT(variable, i, VECTOR_ELT(vars, i));
423 	if (nrows(var_i) != n)
424 	    error(_("variable lengths differ (found for variable %d)"), i);
425 	if (isOrdered_int(var_i)) {
426 	    LOGICAL(ordered)[i] = 1;
427 	    if((INTEGER(nlevs)[i] = nlevels(var_i)) < 1)
428 		error(_("variable %d has no levels"), i+1);
429 	    /* will get updated later when contrasts are set */
430 	    INTEGER(columns)[i] = ncols(var_i);
431 	}
432 	else if (isUnordered_int(var_i)) {
433 	    LOGICAL(ordered)[i] = 0;
434 	    if((INTEGER(nlevs)[i] = nlevels(var_i)) < 1)
435 		error(_("variable %d has no levels"), i+1);
436 	    /* will get updated later when contrasts are set */
437 	    INTEGER(columns)[i] = ncols(var_i);
438 	}
439 	else if (isLogical(var_i)) {
440 	    LOGICAL(ordered)[i] = 0;
441 	    INTEGER(nlevs)[i] = 2;
442 	    INTEGER(columns)[i] = ncols(var_i);
443 	}
444 	else if (isNumeric(var_i)) {
445 	    SET_VECTOR_ELT(variable, i, coerceVector(var_i, REALSXP));
446 	    var_i = VECTOR_ELT(variable, i);
447 	    LOGICAL(ordered)[i] = 0;
448 	    INTEGER(nlevs)[i] = 0;
449 	    INTEGER(columns)[i] = ncols(var_i);
450 	}
451 	else {
452 	    LOGICAL(ordered)[i] = 0;
453 	    INTEGER(nlevs)[i] = 0;
454 	    INTEGER(columns)[i] = ncols(var_i);
455 	}
456 /*	else
457 	    error(_("invalid variable type for '%s'"),
458 	    CHAR(STRING_ELT(vnames, i))); */
459     }
460 
461     /* If there is no intercept we look through the factor pattern */
462     /* matrix and adjust the code for the first factor found so that */
463     /* it will be coded by dummy variables rather than contrasts. */
464 
465     if (!intrcept) {
466 	for (j = 0; j < nterms; j++) {
467 	    for (i = risponse; i < nVar; i++) {
468 		if (INTEGER(nlevs)[i] > 1
469 		    && INTEGER(factors)[i + j * nVar] > 0) {
470 		    INTEGER(factors)[i + j * nVar] = 2;
471 		    goto alldone;
472 		}
473 	    }
474 	}
475     }
476  alldone:
477     ;
478 
479     /* Compute the required contrast or dummy variable matrices. */
480     /* We set up a symbolic expression to evaluate these, substituting */
481     /* the required arguments at call time.  The calls have the following */
482     /* form: (contrast.type nlevs contrasts) */
483 
484     PROTECT(contr1 = allocVector(VECSXP, nVar));
485     PROTECT(contr2 = allocVector(VECSXP, nVar));
486 
487     PROTECT(expr = allocList(3));
488     SET_TYPEOF(expr, LANGSXP);
489     SETCAR(expr, install("contrasts"));
490     SETCADDR(expr, allocVector(LGLSXP, 1));
491 
492     /* FIXME: We need to allow a third argument to this function */
493     /* which allows us to specify contrasts directly.  That argument */
494     /* would be used here in exactly the same way as the below. */
495     /* I.e. we would search the list of constrast specs before */
496     /* we try the evaluation below. */
497 
498     for (i = 0; i < nVar; i++) {
499 	if (INTEGER(nlevs)[i]) {
500 	    k = 0;
501 	    for (j = 0; j < nterms; j++) {
502 		if (INTEGER(factors)[i + j * nVar] == 1)
503 		    k |= 1;
504 		else if (INTEGER(factors)[i + j * nVar] == 2)
505 		    k |= 2;
506 	    }
507 	    SETCADR(expr, VECTOR_ELT(variable, i));
508 	    if (k & 1) {
509 		LOGICAL(CADDR(expr))[0] = 1;
510 		SET_VECTOR_ELT(contr1, i, eval(expr, rho));
511 	    }
512 	    if (k & 2) {
513 		LOGICAL(CADDR(expr))[0] = 0;
514 		SET_VECTOR_ELT(contr2, i, eval(expr, rho));
515 	    }
516 	}
517     }
518 
519     /* By convention, an rhs term identical to the response generates nothing
520        in the model matrix (but interactions involving the response do). */
521 
522     rhs_response = -1;
523     if (risponse > 0) /* there is a response specified */
524 	for (j = 0; j < nterms; j++)
525 	    if (INTEGER(factors)[risponse - 1 + j * nVar]) {
526 		for (i = 0, k = 0; i < nVar; i++)
527 		    k += INTEGER(factors)[i + j * nVar] > 0;
528 		if (k == 1) {
529 		    rhs_response = j;
530 		    break;
531 		}
532 	    }
533 
534 
535     /* We now have everything needed to build the design matrix. */
536     /* The first step is to compute the matrix size and to allocate it. */
537     /* Note that "count" holds a count of how many columns there are */
538     /* for each term in the model and "nc" gives the total column count. */
539 
540     PROTECT(count = allocVector(INTSXP, nterms));
541     if (intrcept)
542 	dnc = 1;
543     else
544 	dnc = 0;
545     for (j = 0; j < nterms; j++) {
546 	if (j == rhs_response) {
547 	    warning(_("the response appeared on the right-hand side and was dropped"));
548 	    INTEGER(count)[j] = 0;  /* need this initialised */
549 	    continue;
550 	}
551 	dk = 1;	/* accumulate in a double to detect overflow */
552 	for (i = 0; i < nVar; i++) {
553 	    if (INTEGER(factors)[i + j * nVar]) {
554 		if (INTEGER(nlevs)[i]) {
555 		    switch(INTEGER(factors)[i + j * nVar]) {
556 		    case 1:
557 			dk *= ncols(VECTOR_ELT(contr1, i));
558 			break;
559 		    case 2:
560 			dk *= ncols(VECTOR_ELT(contr2, i));
561 			break;
562 		    }
563 		}
564 		else dk *= INTEGER(columns)[i];
565 	    }
566 	}
567 	if (dk > INT_MAX) error(_("term %d would require %.0g columns"), j+1, dk);
568 	INTEGER(count)[j] = (int) dk;
569 	dnc = dnc + dk;
570     }
571     if (dnc > INT_MAX) error(_("matrix would require %.0g columns"), dnc);
572     nc = (int) dnc;
573 
574     /* Record which columns of the design matrix are associated */
575     /* with which model terms. */
576 
577     PROTECT(assign = allocVector(INTSXP, nc));
578     k = 0;
579     if (intrcept) INTEGER(assign)[k++] = 0;
580     for (j = 0; j < nterms; j++) {
581 	if(INTEGER(count)[j] <= 0)
582 	    warning(_("problem with term %d in model.matrix: no columns are assigned"),
583 		      j+1);
584 	for (i = 0; i < INTEGER(count)[j]; i++)
585 	    INTEGER(assign)[k++] = j+1;
586     }
587 
588 
589     /* Create column labels for the matrix columns. */
590 
591     PROTECT(xnames = allocVector(STRSXP, nc));
592 
593 
594     /* Here we loop over the terms in the model and, within each */
595     /* term, loop over the corresponding columns of the design */
596     /* matrix, assembling the names. */
597 
598     /* FIXME : The body within these two loops should be embedded */
599     /* in its own function. */
600 
601     k = 0;
602     if (intrcept)
603 	SET_STRING_ELT(xnames, k++, mkChar("(Intercept)"));
604 
605     for (j = 0; j < nterms; j++) {
606 	if (j == rhs_response) continue;
607 	for (kk = 0; kk < INTEGER(count)[j]; kk++) {
608 	    first = 1;
609 	    indx = kk;
610 	    bufp = &buf[0];
611 	    for (i = 0; i < nVar; i++) {
612 		ll = INTEGER(factors)[i + j * nVar];
613 		if (ll) {
614 		    var_i = VECTOR_ELT(variable, i);
615 		    if (!first)
616 			bufp = AppendString(bufp, ":");
617 		    first = 0;
618 		    if (isFactor(var_i) || isLogical(var_i)) {
619 			if (ll == 1) {
620 			    x = ColumnNames(VECTOR_ELT(contr1, i));
621 			    ll = ncols(VECTOR_ELT(contr1, i));
622 			}
623 			else {
624 			    x = ColumnNames(VECTOR_ELT(contr2, i));
625 			    ll = ncols(VECTOR_ELT(contr2, i));
626 			}
627 			addp = translateChar(STRING_ELT(vnames, i));
628 			if(strlen(buf) + strlen(addp) < BUFSIZE)
629 			    bufp = AppendString(bufp, addp);
630 			else
631 			    warning(_("term names will be truncated"));
632 			if (x == R_NilValue) {
633 			    if(strlen(buf) + 10 < BUFSIZE)
634 				bufp = AppendInteger(bufp, indx % ll + 1);
635 			    else
636 				warning(_("term names will be truncated"));
637 			} else {
638 			    addp = translateChar(STRING_ELT(x, indx % ll));
639 			    if(strlen(buf) + strlen(addp) < BUFSIZE)
640 				bufp = AppendString(bufp, addp);
641 			    else
642 				warning(_("term names will be truncated"));
643 			}
644 		    } else if (isComplex(var_i)) {
645 			error(_("complex variables are not currently allowed in model matrices"));
646 		    } else if(isNumeric(var_i)) { /* numeric */
647 			x = ColumnNames(var_i);
648 			ll = ncols(var_i);
649 			addp = translateChar(STRING_ELT(vnames, i));
650 			if(strlen(buf) + strlen(addp) < BUFSIZE)
651 			    bufp = AppendString(bufp, addp);
652 			else
653 			    warning(_("term names will be truncated"));
654 			if (ll > 1) {
655 			    if (x == R_NilValue) {
656 				if(strlen(buf) + 10 < BUFSIZE)
657 				    bufp = AppendInteger(bufp, indx % ll + 1);
658 				else
659 				    warning(_("term names will be truncated"));
660 			    } else {
661 				addp = translateChar(STRING_ELT(x, indx % ll));
662 				if(strlen(buf) + strlen(addp) < BUFSIZE)
663 				    bufp = AppendString(bufp, addp);
664 				else
665 				    warning(_("term names will be truncated"));
666 			    }
667 			}
668 		    } else
669 			error(_("variables of type '%s' are not allowed in model matrices"),
670 			      type2char(TYPEOF(var_i)));
671 		    indx /= ll;
672 		}
673 	    }
674 	    SET_STRING_ELT(xnames, k++, mkChar(buf));
675 	}
676     }
677 
678     /* Allocate and compute the design matrix. */
679 
680     PROTECT(x = allocMatrix(REALSXP, n, nc));
681     double *rx = REAL(x);
682 
683 #ifdef R_MEMORY_PROFILING
684     if (RTRACE(vars)){
685        memtrace_report(vars, x);
686        SET_RTRACE(x, 1);
687     }
688 #endif
689 
690     /* a) Begin with a column of 1s for the intercept. */
691 
692     if ((jnext = jstart = intrcept) != 0)
693 	for (i = 0; i < n; i++)
694 	    rx[i] = 1.0;
695 
696     /* b) Now loop over the model terms */
697 
698     contrast = R_NilValue;	/* -Wall */
699     for (k = 0; k < nterms; k++) {
700 	if (k == rhs_response) continue;
701 	for (i = 0; i < nVar; i++) {
702 	    if (INTEGER(columns)[i] == 0)
703 		continue;
704 	    var_i = VECTOR_ELT(variable, i);
705 #ifdef R_MEMORY_PROFILING
706 	    if (RTRACE(var_i)){
707 	       memtrace_report(var_i, x);
708 	       SET_RTRACE(x, 1);
709 	    }
710 #endif
711 	    fik = INTEGER(factors)[i + k * nVar];
712 	    if (fik) {
713 		switch(fik) {
714 		case 1:
715 		    contrast = coerceVector(VECTOR_ELT(contr1, i), REALSXP);
716 		    break;
717 		case 2:
718 		    contrast = coerceVector(VECTOR_ELT(contr2, i), REALSXP);
719 		    break;
720 		}
721 		PROTECT(contrast);
722 		if (jnext == jstart) {
723 		    if (INTEGER(nlevs)[i] > 0) {
724 			int adj = isLogical(var_i)?1:0;
725 			// avoid overflow of jstart * nn PR#15578
726 			firstfactor(&rx[jstart * nn], n, jnext - jstart,
727 				    REAL(contrast), nrows(contrast),
728 				    ncols(contrast), INTEGER(var_i), adj);
729 			jnext = jnext + ncols(contrast);
730 		    }
731 		    else {
732 			firstvar(&rx[jstart * nn], n, jnext - jstart,
733 				 REAL(var_i), n, ncols(var_i));
734 			jnext = jnext + ncols(var_i);
735 		    }
736 		}
737 		else {
738 		    if (INTEGER(nlevs)[i] > 0) {
739 			int adj = isLogical(var_i)?1:0;
740 			addfactor(&rx[jstart * nn], n, jnext - jstart,
741 				  REAL(contrast), nrows(contrast),
742 				  ncols(contrast), INTEGER(var_i), adj);
743 			jnext = jnext + (jnext - jstart)*(ncols(contrast) - 1);
744 		    }
745 		    else {
746 			addvar(&rx[jstart * nn], n, jnext - jstart,
747 			       REAL(var_i), n, ncols(var_i));
748 			jnext = jnext + (jnext - jstart) * (ncols(var_i) - 1);
749 		    }
750 		}
751 		UNPROTECT(1);
752 	    }
753 	}
754 	jstart = jnext;
755     }
756     PROTECT(tnames = allocVector(VECSXP, 2));
757     SET_VECTOR_ELT(tnames, 0, rnames);
758     SET_VECTOR_ELT(tnames, 1, xnames);
759     setAttrib(x, R_DimNamesSymbol, tnames);
760     setAttrib(x, install("assign"), assign);
761     UNPROTECT(14);
762     return x;
763 }
764 // modelmatrix()
765 
766 
767 /* updateform() :  Update a model formula by the replacement of "." templates.
768    ---------------------------------------------------------------------------
769  */
770 static SEXP tildeSymbol = NULL;
771 static SEXP plusSymbol  = NULL;
772 static SEXP minusSymbol = NULL;
773 static SEXP timesSymbol = NULL;
774 static SEXP slashSymbol = NULL;
775 static SEXP colonSymbol = NULL;
776 static SEXP powerSymbol = NULL;
777 static SEXP dotSymbol   = NULL;
778 static SEXP parenSymbol = NULL;
779 static SEXP inSymbol    = NULL;
780 
ExpandDots(SEXP object,SEXP value)781 static SEXP ExpandDots(SEXP object, SEXP value)
782 {
783     SEXP op;
784 
785     if (TYPEOF(object) == SYMSXP) {
786 	if (object == dotSymbol)
787 	    object = duplicate(value);
788 	return object;
789     }
790 
791     if (TYPEOF(object) == LANGSXP) {
792 	if (TYPEOF(value) == LANGSXP) op = CAR(value);
793 	else op = NULL;
794 	PROTECT(object);
795 	if (CAR(object) == plusSymbol) {
796 	    if (length(object) == 2) {
797 		SETCADR(object, ExpandDots(CADR(object), value));
798 	    }
799 	    else if (length(object) == 3) {
800 		SETCADR(object, ExpandDots(CADR(object), value));
801 		SETCADDR(object, ExpandDots(CADDR(object), value));
802 	    }
803 	    else goto badformula;
804 	}
805 	else if (CAR(object) == minusSymbol) {
806 	    if (length(object) == 2) {
807 		if (CADR(object) == dotSymbol &&
808 		   (op == plusSymbol || op == minusSymbol))
809 		    SETCADR(object, lang2(parenSymbol,
810 					  ExpandDots(CADR(object), value)));
811 		else
812 		    SETCADR(object, ExpandDots(CADR(object), value));
813 	    }
814 	    else if (length(object) == 3) {
815 		if (CADR(object) == dotSymbol &&
816 		   (op == plusSymbol || op == minusSymbol))
817 		    SETCADR(object, lang2(parenSymbol,
818 					  ExpandDots(CADR(object), value)));
819 		else
820 		    SETCADR(object, ExpandDots(CADR(object), value));
821 		if (CADDR(object) == dotSymbol &&
822 		   (op == plusSymbol || op == minusSymbol))
823 		    SETCADDR(object, lang2(parenSymbol,
824 					   ExpandDots(CADDR(object), value)));
825 		else
826 		    SETCADDR(object, ExpandDots(CADDR(object), value));
827 	    }
828 	    else goto badformula;
829 	}
830 	else if (CAR(object) == timesSymbol || CAR(object) == slashSymbol) {
831 	    if (length(object) != 3)
832 		goto badformula;
833 	    if (CADR(object) == dotSymbol &&
834 	       (op == plusSymbol || op == minusSymbol))
835 		SETCADR(object, lang2(parenSymbol,
836 				      ExpandDots(CADR(object), value)));
837 	    else
838 		SETCADR(object, ExpandDots(CADR(object), value));
839 	    if (CADDR(object) == dotSymbol &&
840 	       (op == plusSymbol || op == minusSymbol))
841 		SETCADDR(object, lang2(parenSymbol,
842 				       ExpandDots(CADDR(object), value)));
843 	    else
844 		SETCADDR(object, ExpandDots(CADDR(object), value));
845 	}
846 	else if (CAR(object) == colonSymbol) {
847 	    if (length(object) != 3)
848 		goto badformula;
849 	    if (CADR(object) == dotSymbol &&
850 	       (op == plusSymbol || op == minusSymbol ||
851 		op == timesSymbol || op == slashSymbol))
852 		SETCADR(object, lang2(parenSymbol,
853 				      ExpandDots(CADR(object), value)));
854 	    else
855 		SETCADR(object, ExpandDots(CADR(object), value));
856 	    if (CADDR(object) == dotSymbol &&
857 	       (op == plusSymbol || op == minusSymbol))
858 		SETCADDR(object, lang2(parenSymbol,
859 				       ExpandDots(CADDR(object), value)));
860 	    else
861 		SETCADDR(object, ExpandDots(CADDR(object), value));
862 	}
863 	else if (CAR(object) == powerSymbol) {
864 	    if (length(object) != 3)
865 		goto badformula;
866 	    if (CADR(object) == dotSymbol &&
867 	       (op == plusSymbol || op == minusSymbol ||
868 		op == timesSymbol || op == slashSymbol ||
869 		op == colonSymbol))
870 		SETCADR(object, lang2(parenSymbol,
871 				      ExpandDots(CADR(object), value)));
872 	    else
873 		SETCADR(object, ExpandDots(CADR(object), value));
874 	    if (CADDR(object) == dotSymbol &&
875 	       (op == plusSymbol || op == minusSymbol))
876 		SETCADDR(object, lang2(parenSymbol,
877 				       ExpandDots(CADDR(object), value)));
878 	    else
879 		SETCADDR(object, ExpandDots(CADDR(object), value));
880 	}
881 	else {
882 	    op = object;
883 	    while(op != R_NilValue) {
884 		SETCAR(op, ExpandDots(CAR(op), value));
885 		op = CDR(op);
886 	    }
887 	}
888 	UNPROTECT(1);
889 	return object;
890     }
891     else return object;
892 
893  badformula:
894     error(_("invalid formula in 'update'"));
895     return R_NilValue; /*NOTREACHED*/
896 }
897 
updateform(SEXP old,SEXP new)898 SEXP updateform(SEXP old, SEXP new)
899 {
900     SEXP _new;
901 
902     /* Always fetch these values rather than trying */
903     /* to remember them between calls.  The overhead */
904     /* is minimal and we don't have to worry about */
905     /* intervening dump/restore problems. */
906 
907     tildeSymbol = install("~");
908     plusSymbol  = install("+");
909     minusSymbol = install("-");
910     timesSymbol = install("*");
911     slashSymbol = install("/");
912     colonSymbol = install(":");
913     powerSymbol = install("^");
914     dotSymbol   = install(".");
915     parenSymbol = install("(");
916     inSymbol = install("%in%");
917 
918     /* We must duplicate here because the */
919     /* formulae may be part of the parse tree */
920     /* and we don't want to modify it. */
921 
922     PROTECT(_new = duplicate(new));
923 
924     /* Check of new and old formulae. */
925     if (TYPEOF(old) != LANGSXP ||
926        (TYPEOF(_new) != LANGSXP && CAR(old) != tildeSymbol) ||
927        CAR(_new) != tildeSymbol)
928 	error(_("formula expected"));
929 
930     if (length(old) == 3) {
931 	SEXP lhs = CADR(old);
932 	SEXP rhs = CADDR(old);
933 	/* We now check that new formula has a valid lhs.
934 	   If it doesn't, we add one and set it to the rhs of the old
935 	   formula. */
936 	if (length(_new) == 2)
937 	    SETCDR(_new, CONS(lhs, CDR(_new)));
938 	/* Now we check the left and right sides of the new formula
939 	   and substitute the correct value for any "." templates.
940 	   We must parenthesize the rhs or we might upset arity and
941 	   precedence. */
942 	PROTECT(rhs);
943 	SETCADR(_new, ExpandDots(CADR(_new), lhs));
944 	SETCADDR(_new, ExpandDots(CADDR(_new), rhs));
945 	UNPROTECT(1);
946     }
947     else {
948 	/* The old formula had no lhs, so we only expand the rhs of the
949 	   new formula. */
950 	SEXP rhs = CADR(old);
951 	if (length(_new) == 3)
952 	    SETCADDR(_new, ExpandDots(CADDR(_new), rhs));
953 	else
954 	    SETCADR(_new, ExpandDots(CADR(_new), rhs));
955     }
956 
957     /* It might be overkill to zero the */
958     /* the attribute list of the returned */
959     /* value, but it can't hurt. */
960 
961     SET_ATTRIB(_new, R_NilValue);
962     SET_OBJECT(_new, 0);
963     SEXP DotEnvSymbol = install(".Environment");
964     setAttrib(_new, DotEnvSymbol, getAttrib(old, DotEnvSymbol));
965 
966     UNPROTECT(1);
967     return _new;
968 }
969 
970 
971 /*==========================================================================*/
972 
973 /* termsform() & auxiliaries: Workhorse to turn model formula into terms() object
974    ------------------------------------------------------------------------------
975  */
976 #ifdef DEBUG_terms
977 # include <Print.h>
978 /* can use  printVector(SEXP x, int indx, int quote)
979  *	    ~~~~~~~~~~~		    ^^^^      ^^^^^
980  * to print R vector x[]; if(indx) print indices; if(quote) quote strings
981  * Print a "pair"List (with all *vector* components): */
printPList(SEXP form)982 static void printPList(SEXP form)
983 {
984     R_xlen_t n = 0;
985     SEXP cl;
986     for (cl = form, n = 0; cl != R_NilValue; cl = CDR(cl), n++) {
987 	Rprintf(" L[%d] (|L[.]|=%d): ", n+1, length(CAR(cl)));
988 	printVector(CAR(cl), 0, 0);
989     }
990     return;
991 }
992 static Rboolean trace_GetBit = TRUE;
993 static Rboolean trace_InstallVar = TRUE;
994 static int n_xVars; // nesting level: use for indentation
995 #endif
996 
997 // word size in *bits* :
998 #define WORDSIZE (8*sizeof(int))
999 
1000 //  Global "State" Variables for terms() computation :
1001 //  -------------------------------------------------
1002 
1003 static int // 0/1 (Boolean) :
1004     intercept,		//  1: have intercept term in the model
1005     parity,		//  +/- parity
1006     response;		//  1: response term in the model
1007 static int nwords;		/* # of words (ints) to code a term */
1008 static SEXP varlist;		/* variables in the model */
1009 static PROTECT_INDEX vpi;
1010 static SEXP framenames;		/* variables names for specified frame */
1011 static Rboolean haveDot;	/* does RHS of formula contain `.'? */
1012 
isZeroOne(SEXP x)1013 static int isZeroOne(SEXP x)
1014 {
1015     if (!isNumeric(x)) return 0;
1016     return (asReal(x) == 0.0 || asReal(x) == 1.0);
1017 }
1018 
isZero(SEXP x)1019 static int isZero(SEXP x)
1020 {
1021     if (!isNumeric(x)) return 0;
1022     return asReal(x) == 0.0;
1023 }
1024 
isOne(SEXP x)1025 static int isOne(SEXP x)
1026 {
1027     if (!isNumeric(x)) return 0;
1028     return asReal(x) == 1.0;
1029 }
1030 
1031 
1032 /* MatchVar determines whether two ``variables'' are */
1033 /* identical.  Expressions are identical if they have */
1034 /* the same list structure and their atoms are identical. */
1035 /* This is just EQUAL from lisp. */
1036 
1037 /* See src/main/memory.c: probably could be simplified to pointer comparison */
Seql2(SEXP a,SEXP b)1038 static int Seql2(SEXP a, SEXP b)
1039 {
1040     if (a == b) return 1;
1041     if (IS_CACHED(a) && IS_CACHED(b) && ENC_KNOWN(a) == ENC_KNOWN(b))
1042 	return 0;
1043     else {
1044     	const void *vmax = vmaxget();
1045     	int result = !strcmp(translateCharUTF8(a), translateCharUTF8(b));
1046     	vmaxset(vmax); /* discard any memory used by translateCharUTF8 */
1047     	return result;
1048     }
1049 }
1050 
MatchVar(SEXP var1,SEXP var2)1051 static int MatchVar(SEXP var1, SEXP var2)
1052 {
1053     /* For expedience, and sanity... */
1054     if ( var1 == var2 )
1055 	return 1;
1056     /* Handle Nulls */
1057     if (isNull(var1) && isNull(var2))
1058 	return 1;
1059     if (isNull(var1) || isNull(var2))
1060 	return 0;
1061     /* Non-atomic objects - compare CARs & CDRs (and TAGs:  PR#17235) */
1062     if ((isList(var1) || isLanguage(var1)) &&
1063 	(isList(var2) || isLanguage(var2)))
1064 	return MatchVar(CAR(var1), CAR(var2)) &&
1065 	       MatchVar(CDR(var1), CDR(var2)) &&
1066 	       MatchVar(TAG(var1), TAG(var2));
1067     /* Symbols */
1068     if (isSymbol(var1) && isSymbol(var2))
1069 	return (var1 == var2);
1070     /* Literal Numerics */
1071     if (isNumeric(var1) && isNumeric(var2))
1072 	return (asReal(var1) == asReal(var2));
1073     /* Literal Strings */
1074     if (isString(var1) && isString(var2))
1075 	return Seql2(STRING_ELT(var1, 0), STRING_ELT(var2, 0));
1076     /* Nothing else matches */
1077     return 0;
1078 }
1079 
1080 
1081 /* InstallVar locates a ''variable'' in the model variable list
1082    adding it to the global varlist if not found. */
1083 
1084 /* FIXME?  This is used in two "almost orthogonal" situations:
1085  * 1) Install variable in varlist -- index is *not* used by caller:
1086  *    InstallVar(.) called by ExtractVars() always w/o assigning result.
1087  * 2) Find the index of 'var' in 'varlist' (but do not change varlist,
1088  *    as now *all* variables should have been assigned by ExtractVars());
1089  *    called as  'whichBit = InstallVar(.)' *only* by AllocTermSetBit1()
1090  */
InstallVar(SEXP var)1091 static int InstallVar(SEXP var)
1092 {
1093     SEXP v;
1094     /* Check that variable is legitimate */
1095     if (!isSymbol(var) && !isLanguage(var) && !isZeroOne(var))
1096 	error(_("invalid term in model formula"));
1097     /* Lookup/Install it */
1098     int indx = 0;
1099     for (v = varlist; CDR(v) != R_NilValue; v = CDR(v)) {
1100 	indx++;
1101 	if (MatchVar(var, CADR(v)))
1102 	    return indx;
1103     }
1104 #ifdef DEBUG_terms
1105     if(trace_InstallVar)
1106 	Rprintf("InstallVar(%s): adding it to 'varlist' and return indx+1 = %d\n",
1107 		CHAR(STRING_ELT(deparse1line(var, 0), 0)), indx + 1);
1108 #endif
1109     SETCDR(v, CONS(var, R_NilValue));
1110     return indx + 1;
1111 }
1112 
1113 
1114 /* If there is a dotsxp being expanded then we need to see
1115    whether any of the variables in the data frame match with
1116    the variable on the lhs. If so they shouldn't be included
1117    in the factors */
1118 
CheckRHS(SEXP v)1119 static void CheckRHS(SEXP v)
1120 {
1121     while ((isList(v) || isLanguage(v)) && v != R_NilValue) {
1122 	CheckRHS(CAR(v));
1123 	v = CDR(v);
1124     }
1125     if (isSymbol(v)) {
1126 	for (int i = 0; i < length(framenames); i++) {
1127 	    SEXP s = installTrChar(STRING_ELT(framenames, i));
1128 	    if (v == s) {
1129 		SEXP t = allocVector(STRSXP, length(framenames) - 1);
1130 		for (int j = 0; j < length(t); j++) {
1131 		    if (j < i)
1132 			SET_STRING_ELT(t, j, STRING_ELT(framenames, j));
1133 		    else
1134 			SET_STRING_ELT(t, j, STRING_ELT(framenames, j+1));
1135 		}
1136 		REPROTECT(framenames = t, vpi);
1137 	    }
1138 	}
1139     }
1140 }
1141 
1142 
1143 /* ExtractVars() recursively extracts the variables in a model formula.
1144  * ------------- It calls InstallVar() to do the installation.
1145  * The code takes care of unary  '+' and '-' .  No checks are made
1146  * of the other ``binary'' operators.  Maybe there should be some.
1147 
1148  * NB: logic here is partly "repeated" in   EncodeVars()  --->> keep in sync !!
1149 */
ExtractVars(SEXP formula)1150 static void ExtractVars(SEXP formula)
1151 {
1152 #ifdef DEBUG_terms
1153     // for(int j=0; j < n_xVars; j++) Rprintf("  ");
1154     // indentation by (2 * n_xVars)
1155     Rprintf("%*d ExtractVars(%s): ", 2*n_xVars, n_xVars,
1156 	    CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1157     n_xVars++; // nesting level: use for indentation (by 2):
1158     int my_n_x = n_xVars; // so we can restore n_xVars before leaving
1159 
1160 #  define Prt_xtrVars(_S_) Rprintf(" case '%s'\n", _S_)
1161 #  define Return  n_xVars = my_n_x; return
1162 #else
1163 #  define Prt_xtrVars(_S_)
1164 #  define Return  return
1165 #endif
1166 
1167     if (isNull(formula) || isZeroOne(formula)) {
1168 	Prt_xtrVars("Null or ZeroOne");
1169 	Return;
1170     }
1171     if (isSymbol(formula)) {
1172 	if (formula == dotSymbol) haveDot = TRUE;
1173 	Prt_xtrVars(haveDot ? "isSym(<Dot>)" : "isSymbol()");
1174 	    if (haveDot && framenames != R_NilValue) {
1175 		// install variables of the data frame:
1176 		for (int i = 0; i < length(framenames); i++) {
1177 		    SEXP v = installTrChar(STRING_ELT(framenames, i));
1178 		    if (!MatchVar(v, CADR(varlist))) InstallVar(v);
1179 		}
1180 	    } else
1181 		InstallVar(formula);
1182 	Return;
1183     }
1184     if (isLanguage(formula)) {
1185 	if (CAR(formula) == tildeSymbol) {
1186 	    if (response)
1187 		error(_("invalid model formula")); // more than one '~'
1188 	    if (isNull(CDDR(formula))) {
1189 		Prt_xtrVars("isLanguage, tilde, *not* response");
1190 		response = 0;
1191 		ExtractVars(CADR(formula));
1192 	    }
1193 	    else {
1194 		Prt_xtrVars("isLanguage, tilde, *response*");
1195 		response = 1;
1196 		InstallVar(CADR(formula));
1197 		ExtractVars(CADDR(formula));
1198 	    }
1199 	    Return;
1200 	}
1201 	if (CAR(formula) == plusSymbol) { // '+'
1202 	    int len = length(formula);
1203 	    Prt_xtrVars("isLanguage, '+'");
1204 	    if (len > 1)
1205 		ExtractVars(CADR(formula));
1206 	    if (len > 2) // binary
1207 		ExtractVars(CADDR(formula));
1208 	    Return;
1209 	}
1210 	if (CAR(formula) == colonSymbol) {
1211 	    Prt_xtrVars("isLanguage, ':'");
1212 	    ExtractVars(CADR(formula));
1213 	    ExtractVars(CADDR(formula));
1214 	    Return;
1215 	}
1216 	if (CAR(formula) == powerSymbol) {
1217 	    Prt_xtrVars("isLanguage, '^'");
1218 	    if (!isNumeric(CADDR(formula)))
1219 		error(_("invalid power in formula"));
1220 	    ExtractVars(CADR(formula));
1221 	    Return;
1222 	}
1223 	if (CAR(formula) == timesSymbol) {
1224 	    Prt_xtrVars("isLanguage, '*'");
1225 	    ExtractVars(CADR(formula));
1226 	    ExtractVars(CADDR(formula));
1227 	    Return;
1228 	}
1229 	if (CAR(formula) == inSymbol) {
1230 	    Prt_xtrVars("isLanguage, '%in%'");
1231 	    ExtractVars(CADR(formula));
1232 	    ExtractVars(CADDR(formula));
1233 	    Return;
1234 	}
1235 	if (CAR(formula) == slashSymbol) {
1236 	    Prt_xtrVars("isLanguage, '/'");
1237 	    ExtractVars(CADR(formula));
1238 	    ExtractVars(CADDR(formula));
1239 	    Return;
1240 	}
1241 	if (CAR(formula) == minusSymbol) { // '-'
1242 	    Prt_xtrVars("isLanguage, '-'");
1243 	    int len = length(formula);
1244 	    if (len == 2) { // unary
1245 		ExtractVars(CADR(formula));
1246 	    }
1247 	    else {
1248 		ExtractVars(CADR(formula));
1249 		ExtractVars(CADDR(formula));
1250 	    }
1251 	    Return;
1252 	}
1253 	if (CAR(formula) == parenSymbol) {
1254 	    Prt_xtrVars("isLanguage, '('");
1255 	    ExtractVars(CADR(formula));
1256 	    Return;
1257 	}
1258 	// all other calls:
1259 	Prt_xtrVars("_otherwise_");
1260 #ifdef DEBUG_terms
1261 	Rprintf(" .. {%d} ExtractVars(f): installing formula f='%s'\n",
1262 		n_xVars,
1263 		CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1264 #endif
1265 	InstallVar(formula);
1266 	Return;
1267     }
1268     error(_("invalid model formula in ExtractVars"));
1269 }
1270 
1271 
1272 /* AllocTerm allocates an integer array for bit string representation
1273  * of a model term and initializes to __no bit set__ */
1274 
AllocTerm(void)1275 static SEXP AllocTerm(void) // (<global> nwords)
1276 {
1277     SEXP term = allocVector(INTSXP, nwords); // caller must PROTECT
1278     memset(INTEGER(term), 0, nwords * sizeof(int));
1279     return term;
1280 }
1281 
1282 
1283 /* SetBit sets bit ``whichBit'' to value ``value''
1284    in the bit string representation of a term. */
1285 
SetBit(SEXP term,int whichBit,int value)1286 static void SetBit(SEXP term, int whichBit, int value)
1287 {
1288     int
1289 	word = (whichBit - 1) / WORDSIZE,
1290 	offset = (- whichBit) % WORDSIZE;
1291 #ifdef DEBUG_terms
1292     Rprintf("SetBit(term, which=%3d, value=%d): |term|=%d, word= %d, offset= %2d\n",
1293 	    whichBit, value, length(term), word, offset);
1294 #endif
1295     if (value)
1296 	((unsigned *) INTEGER(term))[word] |=  ((unsigned) 1 << offset);
1297     else
1298 	((unsigned *) INTEGER(term))[word] &= ~((unsigned) 1 << offset);
1299 }
1300 
1301 /* 0.   Install var (if needed)
1302  * 1a.  Check if nwords is large enough for 'whichBit'
1303  * 1b.  If not, increment nwords
1304  * 2.  term = AllocTerm();
1305  * 3.  SetBit(term, whichBit, 1);
1306  */
AllocTermSetBit1(SEXP var)1307 static SEXP AllocTermSetBit1(SEXP var) { // NB: caller must PROTECT
1308     int whichBit = InstallVar(var);
1309     if (nwords < (whichBit - 1)/WORDSIZE + 1)
1310 	error("AllocT..Bit1(%s): Need to increment nwords to %d. Should not happen!\n",
1311 		CHAR(STRING_ELT(deparse1line(var, 0), 0)),
1312 		nwords+1);
1313     SEXP term = AllocTerm();
1314     SetBit(term, whichBit, 1);
1315     return term;
1316 }
1317 
1318 
1319 /* GetBit gets bit ``whichBit'' from the */
1320 /* bit string representation of a term. */
1321 
GetBit(SEXP term,int whichBit)1322 static int GetBit(SEXP term, int whichBit)
1323 {
1324     int
1325 	word = (whichBit - 1) / WORDSIZE,
1326 	offset = (- whichBit) % WORDSIZE,
1327 	word_off = INTEGER(term)[word] >> offset;
1328 #ifdef DEBUG_terms
1329     if(trace_GetBit) {
1330 	Rprintf("GetBit(term,%3d): |term|=%2d, word=%d, offset= %2d --> bit= %d\n",
1331 		whichBit, length(term), word, offset,
1332 		word_off & 1);
1333     }
1334 #endif
1335     return word_off & 1;
1336 }
1337 
1338 
1339 /* OrBits computes a new (bit string) term */
1340 /* which contains the logical OR of the bits */
1341 /* in ``term1'' and ``term2''. */
1342 
OrBits(SEXP term1,SEXP term2)1343 static SEXP OrBits(SEXP term1, SEXP term2)
1344 {
1345     SEXP term = AllocTerm();
1346     for (int i = 0; i < nwords; i++)
1347 	INTEGER(term)[i] = INTEGER(term1)[i] | INTEGER(term2)[i];
1348     return term;
1349 }
1350 
1351 
1352 // BitCount counts the number of ``on'' bits in a term
BitCount(SEXP term,int nvar)1353 static int BitCount(SEXP term, int nvar)
1354 {
1355     int sum = 0;
1356 #ifdef DEBUG_terms
1357     Rprintf("BitCount(*, nvar=%d): ", nvar);
1358 #endif
1359     for (int i = 1; i <= nvar; i++)
1360 	sum += GetBit(term, i);
1361 #ifdef DEBUG_terms
1362     Rprintf("  end{BitCount}\n");
1363 #endif
1364     return sum;
1365 }
1366 
1367 
1368 /* TermZero tests whether a (bit string) term is zero */
1369 
TermZero(SEXP term)1370 static int TermZero(SEXP term)
1371 {
1372     for (int i = 0; i < nwords; i++)
1373         if (INTEGER(term)[i] != 0)
1374 	    return 0;
1375     return 1;
1376 }
1377 
1378 
1379 /* TermEqual tests two (bit string) terms for equality. */
1380 
TermEqual(SEXP term1,SEXP term2)1381 static int TermEqual(SEXP term1, SEXP term2)
1382 {
1383     for (int i = 0; i < nwords; i++)
1384         if (INTEGER(term1)[i] != INTEGER(term2)[i])
1385             return 0;
1386     return 1;
1387 }
1388 
1389 
1390 /* StripTerm strips the specified term from the given list.
1391    This mutates the list (but the caller replaces it by 'root').
1392    Only called from  DeleteTerms() i.e.,  " left - right "
1393 */
1394 
StripTerm(SEXP term,SEXP list)1395 static SEXP StripTerm(SEXP term, SEXP list)
1396 {
1397     SEXP root = R_NilValue, prev = R_NilValue;
1398     if (TermZero(term))
1399 	intercept = 0;
1400     while (list != R_NilValue) {
1401 	if (TermEqual(term, CAR(list))) {
1402 	    if (prev != R_NilValue)
1403 		SETCDR(prev, CDR(list));
1404 	} else {
1405 	    if (root == R_NilValue)
1406 		root = list;
1407 	    prev = list;
1408 	}
1409 	list = CDR(list);
1410     }
1411     return root;
1412 }
1413 
1414 
1415 /* TrimRepeats removes duplicates of (bit string) terms in a model formula.
1416    Also drops zero terms. */
1417 
TrimRepeats(SEXP list)1418 static SEXP TrimRepeats(SEXP list)
1419 {
1420     // Drop zero terms at the start of the list.
1421     while (list != R_NilValue && TermZero(CAR(list))) {
1422 	list = CDR(list);
1423     }
1424     if (list == R_NilValue || CDR(list) == R_NilValue)
1425 	return list;
1426 
1427     PROTECT(list);
1428     // Find out which terms are duplicates.
1429     SEXP all_terms = PROTECT(PairToVectorList(list)),
1430 	duplicate_sexp = PROTECT(duplicated(all_terms, FALSE));
1431     int i_p1 = 1, *is_duplicate = LOGICAL(duplicate_sexp);
1432 
1433     // Remove the zero terms and duplicates from the list.
1434     for (SEXP current = list; CDR(current) != R_NilValue; i_p1++) {
1435 	SEXP next = CDR(current);
1436 	if (is_duplicate[i_p1] || TermZero(CAR(next))) {
1437 	    // Remove the node from the list.
1438 	    SETCDR(current, CDR(next));
1439 	} else {
1440 	    current = next;
1441 	}
1442     }
1443 
1444     UNPROTECT(3);
1445     return list;
1446 }
1447 
1448 
1449 
1450 /*==========================================================================*/
1451 
1452 /* Model Formula Manipulation */
1453 /* These functions take a numerically coded */
1454 /* formula and fully expand it. */
1455 
1456 static SEXP EncodeVars(SEXP);/* defined below */
1457 
1458 
1459 /* PlusTerms expands ``left'' and ``right'' and */
1460 /* concatenates their terms (removing duplicates). */
1461 
PlusTerms(SEXP left,SEXP right)1462 static SEXP PlusTerms(SEXP left, SEXP right)
1463 {
1464     PROTECT(left = EncodeVars(left));
1465     right = EncodeVars(right);
1466     UNPROTECT(1);
1467     return TrimRepeats(listAppend(left, right));
1468 }
1469 
1470 
1471 /* InteractTerms expands ``left'' and ``right'' */
1472 /* and forms a new list of terms containing the bitwise */
1473 /* OR of each term in ``left'' with each term in ``right''. */
1474 
InteractTerms(SEXP left,SEXP right)1475 static SEXP InteractTerms(SEXP left, SEXP right)
1476 {
1477     SEXP term, l, r, t;
1478     PROTECT(left = EncodeVars(left));
1479     PROTECT(right = EncodeVars(right));
1480     PROTECT(term = allocList(length(left) * length(right)));
1481     t = term;
1482     for (l = left; l != R_NilValue; l = CDR(l))
1483 	for (r = right; r != R_NilValue; r = CDR(r)) {
1484 	    SETCAR(t, OrBits(CAR(l), CAR(r)));
1485 	    t = CDR(t);
1486 	}
1487     UNPROTECT(3);
1488     return TrimRepeats(term);
1489 }
1490 
1491 
1492 /* CrossTerms expands ``left'' and ``right'' */
1493 /* and forms the ``cross'' of the list of terms.  */
1494 /* Duplicates are removed. */
1495 
CrossTerms(SEXP left,SEXP right)1496 static SEXP CrossTerms(SEXP left, SEXP right)
1497 {
1498     SEXP term, l, r, t;
1499     PROTECT(left = EncodeVars(left));
1500     PROTECT(right = EncodeVars(right));
1501     PROTECT(term = allocList(length(left) * length(right)));
1502     t = term;
1503     for (l = left; l != R_NilValue; l = CDR(l))
1504 	for (r = right; r != R_NilValue; r = CDR(r)) {
1505 	    SETCAR(t, OrBits(CAR(l), CAR(r)));
1506 	    t = CDR(t);
1507 	}
1508     UNPROTECT(3);
1509     listAppend(right, term);
1510     listAppend(left, right);
1511     return TrimRepeats(left);
1512 }
1513 
1514 
1515 /* PowerTerms expands the ``left'' form and then */
1516 /* raises it to the power specified by the right term. */
1517 /* Allocation here is wasteful, but so what ... */
1518 
PowerTerms(SEXP left,SEXP right)1519 static SEXP PowerTerms(SEXP left, SEXP right)
1520 {
1521     SEXP term, l, r, t;
1522     int i, ip;
1523     ip = asInteger(right);
1524     if (ip==NA_INTEGER || ip <= 1)
1525 	error(_("invalid power in formula"));
1526     term = R_NilValue;		/* -Wall */
1527     PROTECT(left = EncodeVars(left));
1528     right = left;
1529     for (i=1; i < ip; i++)  {
1530 	PROTECT(right);
1531 	PROTECT(term = allocList(length(left) * length(right)));
1532 	t = term;
1533 	for (l = left; l != R_NilValue; l = CDR(l))
1534 	    for (r = right; r != R_NilValue; r = CDR(r)) {
1535 		SETCAR(t, OrBits(CAR(l), CAR(r)));
1536 		t = CDR(t);
1537 	    }
1538 	UNPROTECT(2);
1539 	right = TrimRepeats(term);
1540     }
1541     UNPROTECT(1);
1542     return term;
1543 }
1544 
1545 
1546 /* InTerms expands ``left'' and ``right'' and */
1547 /* forms the ``nest'' of the the left in the */
1548 /* interaction of the right */
1549 
InTerms(SEXP left,SEXP right)1550 static SEXP InTerms(SEXP left, SEXP right)
1551 {
1552     PROTECT(left = EncodeVars(left));
1553     PROTECT(right = EncodeVars(right));
1554     SEXP t, term = PROTECT(AllocTerm());
1555     int *term_ = INTEGER(term);
1556     /* Bitwise or of all terms on right */
1557     for (t = right; t != R_NilValue; t = CDR(t)) {
1558 	for (int i = 0; i < nwords; i++)
1559 	    term_[i] = term_[i] | INTEGER(CAR(t))[i];
1560     }
1561     /* Now bitwise or with each term on the left */
1562     for (t = left; t != R_NilValue; t = CDR(t))
1563 	for (int i = 0; i < nwords; i++)
1564 	    INTEGER(CAR(t))[i] = term_[i] | INTEGER(CAR(t))[i];
1565     UNPROTECT(3);
1566     return TrimRepeats(left);
1567 }
1568 
1569 /* NestTerms expands ``left'' and ``right'' */
1570 /* and forms the ``nest'' of the list of terms.  */
1571 /* Duplicates are removed. */
1572 
NestTerms(SEXP left,SEXP right)1573 static SEXP NestTerms(SEXP left, SEXP right)
1574 {
1575     PROTECT(left  = EncodeVars(left));
1576     PROTECT(right = EncodeVars(right));
1577     SEXP t, term = PROTECT(AllocTerm());
1578     int *term_ = INTEGER(term);
1579     /* Bitwise or of all terms on left */
1580     for (t = left; t != R_NilValue; t = CDR(t)) {
1581 	for (int i = 0; i < nwords; i++)
1582 	    term_[i] = term_[i] | INTEGER(CAR(t))[i];
1583     }
1584     /* Now bitwise or with each term on the right */
1585     for (t = right; t != R_NilValue; t = CDR(t))
1586 	for (int i = 0; i < nwords; i++)
1587 	    INTEGER(CAR(t))[i] = term_[i] | INTEGER(CAR(t))[i];
1588     UNPROTECT(3);
1589     listAppend(left, right);
1590     return TrimRepeats(left);
1591 }
1592 
1593 
1594 /* DeleteTerms expands ``left'' and ``right'' */
1595 /* and then removes any terms which appear in */
1596 /* ``right'' from ``left''. */
1597 
DeleteTerms(SEXP left,SEXP right)1598 static SEXP DeleteTerms(SEXP left, SEXP right)
1599 {
1600     PROTECT(left  = EncodeVars(left));	parity = 1-parity;
1601     PROTECT(right = EncodeVars(right)); parity = 1-parity;
1602     for (SEXP t = right; t != R_NilValue; t = CDR(t))
1603 	left = StripTerm(CAR(t), left);
1604     UNPROTECT(2);
1605     return left;
1606 }
1607 
1608 
1609 /* EncodeVars() performs  model expansion and bit string encoding.
1610  * This is the real workhorse of model expansion in terms.formula() == termsform()
1611  *
1612  * Returns a (pair)list of length 'nterm' containing integer vectors [1:nwords],
1613  */
EncodeVars(SEXP formula)1614 static SEXP EncodeVars(SEXP formula)
1615 {
1616     if (isNull(formula))		return R_NilValue;
1617     else if (isOne(formula)) {
1618 	intercept = (parity) ? 1 : 0;	return R_NilValue;
1619     }
1620     else if (isZero(formula)) {
1621 	intercept = (parity) ? 0 : 1;	return R_NilValue;
1622     }
1623     // else :
1624     SEXP term;
1625     if (isSymbol(formula)) {
1626 	if (formula == dotSymbol && framenames != R_NilValue) {
1627 	    /* prior to 1.7.0 this made term.labels in reverse order. */
1628 	    SEXP r = R_NilValue, v = R_NilValue; /* -Wall */
1629 	    if (!LENGTH(framenames)) return r;
1630 	    const void *vmax = vmaxget();
1631 	    for (int i = 0; i < LENGTH(framenames); i++) {
1632 		/* change in 1.6.0 do not use duplicated names */
1633 		const char *c = translateChar(STRING_ELT(framenames, i));
1634 		for(int j = 0; j < i; j++)
1635 		    if(!strcmp(c, translateChar(STRING_ELT(framenames, j))))
1636 			error(_("duplicated name '%s' in data frame using '.'"),
1637 			      c);
1638 		term = AllocTermSetBit1(install(c));
1639 #ifdef DEBUG_terms
1640 		Rprintf(".. in 'isSymbol(<dotSymbol>), after AllocT...1()\n");
1641 #endif
1642 		if(i == 0) PROTECT(v = r = cons(term, R_NilValue));
1643 		else {SETCDR(v, CONS(term, R_NilValue)); v = CDR(v);}
1644 	    }
1645 	    UNPROTECT(1);
1646 	    vmaxset(vmax);
1647 	    return r;
1648 	}
1649 	else {
1650 	    term = AllocTermSetBit1(formula);
1651 #ifdef DEBUG_terms
1652 	    Rprintf(".. in 'isSymbol(%s) [regular], after AllocT...1()\n",
1653 		    CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1654 #endif
1655 	    return CONS(term, R_NilValue);
1656 	}
1657     }
1658     if (isLanguage(formula)) {
1659 	if (CAR(formula) == tildeSymbol) {
1660 	    if (isNull(CDDR(formula)))
1661 		return EncodeVars(CADR(formula));
1662 	    else
1663 		return EncodeVars(CADDR(formula));
1664 	}
1665 	if (CAR(formula) == plusSymbol) {
1666 	    int len = length(formula);
1667 	    if (len == 2) // unary '+'
1668 		return EncodeVars(CADR(formula));
1669 	    else
1670 		return PlusTerms(CADR(formula), CADDR(formula));
1671 	}
1672 	if (CAR(formula) == colonSymbol) {
1673 	    return InteractTerms(CADR(formula), CADDR(formula));
1674 	}
1675 	if (CAR(formula) == timesSymbol) {
1676 	    return CrossTerms(CADR(formula), CADDR(formula));
1677 	}
1678 	if (CAR(formula) == inSymbol) {
1679 	    return InTerms(CADR(formula), CADDR(formula));
1680 	}
1681 	if (CAR(formula) == slashSymbol) {
1682 	    return NestTerms(CADR(formula), CADDR(formula));
1683 	}
1684 	if (CAR(formula) == powerSymbol) {
1685 	    return PowerTerms(CADR(formula), CADDR(formula));
1686 	}
1687 	if (CAR(formula) == minusSymbol) {
1688 	    int len = length(formula);
1689 	    if (len == 2) // unary '-'
1690 		return DeleteTerms(R_NilValue, CADR(formula));
1691 	    return DeleteTerms(CADR(formula), CADDR(formula));
1692 	}
1693 	if (CAR(formula) == parenSymbol) {
1694 	    return EncodeVars(CADR(formula));
1695 	}
1696 	term = AllocTermSetBit1(formula);
1697 #ifdef DEBUG_terms
1698 	Rprintf(".. before returning from EncodeVars(%s), after AllocT...1()\n",
1699 		CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1700 #endif
1701 	return CONS(term, R_NilValue);
1702     }
1703     error(_("invalid model formula in EncodeVars"));
1704     return R_NilValue;/*NOTREACHED*/
1705 }
1706 
1707 
1708 /* TermCode decides on the encoding of a model term. */
1709 /* Returns 1 if variable ``whichBit'' in ``thisTerm'' */
1710 /* is to be encoded by contrasts and 2 if it is to be */
1711 /* encoded by dummy variables.  This is decided using */
1712 /* the heuristic described in Statistical Models in S, page 38. */
1713 
1714 // called only in one place, in "step 4"
TermCode(SEXP termlist,SEXP thisterm,int whichbit,SEXP term)1715 static int TermCode(SEXP termlist, SEXP thisterm, int whichbit, SEXP term)
1716 {
1717     int *term_ = INTEGER(term),
1718 	*th_t  = INTEGER(CAR(thisterm));
1719     for (int i = 0; i < nwords; i++)
1720 	term_[i] = th_t[i];
1721 
1722     /* Eliminate factor ``whichbit'' */
1723     SetBit(term, whichbit, 0);
1724 
1725     /* Search preceding terms for a match */
1726     /* Zero is a possibility - it is a special case */
1727 
1728     int allzero = 1;
1729     for (int i = 0; i < nwords; i++) {
1730 	if (term_[i]) {
1731 	    allzero = 0;
1732 	    break;
1733 	}
1734     }
1735     if (allzero)
1736 	return 1;
1737 
1738     for (SEXP t = termlist; t != thisterm; t = CDR(t)) {
1739 	allzero = 1;
1740 	int *ct = INTEGER(CAR(t));
1741 	for (int i = 0; i < nwords; i++)
1742 	    if (term_[i] & ~ct[i]) {
1743 		allzero = 0; break;
1744 	    }
1745 	if (allzero)
1746 	    return 1;
1747     }
1748     return 2;
1749 }
1750 
1751 
1752 /* Internal code for the ``terms'' function */
1753 /* The value is a formula with an assortment */
1754 /* of useful attributes. */
1755 
1756 // R's  terms.formula(x, specials, data, keep.order, allowDotAsName)   in ../R/models.R
termsform(SEXP args)1757 SEXP termsform(SEXP args)
1758 {
1759     args = CDR(args); // (called via .External)
1760 
1761     /* Always fetch these values rather than trying to remember them
1762        between calls.  The overhead is minimal. */
1763 
1764     tildeSymbol = install("~");
1765     plusSymbol  = install("+");
1766     minusSymbol = install("-");
1767     timesSymbol = install("*");
1768     slashSymbol = install("/");
1769     colonSymbol = install(":");
1770     powerSymbol = install("^");
1771     dotSymbol   = install(".");
1772     parenSymbol = install("(");
1773     inSymbol = install("%in%");
1774 
1775     /* Do we have a model formula? <==> Check for unary or binary ~ */
1776 
1777     if (!isLanguage(CAR(args)) ||
1778 	CAR(CAR(args)) != tildeSymbol ||
1779 	(length(CAR(args)) != 2 && length(CAR(args)) != 3))
1780 	error(_("argument is not a valid model"));
1781 
1782     haveDot = FALSE;
1783 
1784     SEXP ans = PROTECT(duplicate(CAR(args)));
1785 
1786     /* The formula will be returned, modified if haveDot becomes TRUE */
1787 
1788     SEXP specials = CADR(args);
1789     if(length(specials) && !isString(specials))
1790 	error(_("'specials' must be NULL or a character vector"));
1791     SEXP
1792 	a = CDDR(args),
1793 	/* We use data to get the value to substitute for "." in formulae */
1794 	data = CAR(a);
1795     a = CDR(a);
1796     if (isNull(data) || isEnvironment(data))
1797 	framenames = R_NilValue;
1798     else if (isFrame(data))
1799 	framenames = getAttrib(data, R_NamesSymbol);
1800     else
1801 	error(_("'data' argument is of the wrong type"));
1802     PROTECT_WITH_INDEX(framenames, &vpi);
1803 
1804     Rboolean hadFrameNames = FALSE;
1805     if (framenames != R_NilValue) {
1806 	if(length(framenames)) hadFrameNames = TRUE;
1807 	if (length(CAR(args)) == 3)
1808 	    CheckRHS(CADR(CAR(args)));
1809     }
1810 
1811     /* Preserve term order? */
1812     int keepOrder = asLogical(CAR(a));
1813     if (keepOrder == NA_LOGICAL)
1814 	keepOrder = 0;
1815 
1816     a = CDR(a);
1817     int allowDot = asLogical(CAR(a));
1818     if (allowDot == NA_LOGICAL) allowDot = 0;
1819 
1820     // a := attributes(<answer>)
1821     a = allocList((specials == R_NilValue) ? 8 : 9);
1822     SET_ATTRIB(ans, a);
1823 
1824     /* Step 1: Determine the ``variables'' in the model :
1825      * ------ Here we create a call of the form list(...).
1826      * You can evaluate it to get the model variables or use substitute
1827      * and then pull the result apart to get the variable names. */
1828 
1829     intercept = 1;
1830     parity = 1;
1831     response = 0;
1832     PROTECT(varlist = LCONS(install("list"), R_NilValue));
1833 #ifdef DEBUG_terms
1834     n_xVars = 0; // the nesting level of ExtractVars()
1835     Rprintf("termsform(): calling ExtractVars(CAR(args), 1), where\n CAR(args)='%s'\n",
1836 	    CHAR(STRING_ELT(deparse1line(CAR(args), 0), 0)));
1837 #endif
1838     ExtractVars(CAR(args));
1839     //^^^^^^^^^ fills the 'varlist' = attr(<terms>, "variables")
1840     UNPROTECT(1);
1841     SETCAR(a, varlist);
1842     SET_TAG(a, install("variables"));
1843     a = CDR(a);
1844 
1845     int nvar = length(varlist) - 1; /* Number of variables in the formula */
1846 
1847     /* in allocating words need to allow for intercept term (PR#15735) */
1848     nwords = nvar/WORDSIZE + 1; // global; used & incremented in EncodeVars()
1849 #ifdef DEBUG_terms
1850     Rprintf(" .. after ExtractVars(): ini. nvar = %d, nwords = %d;%sCalling EncodeVars():%s",
1851 	    nvar, nwords,
1852 	    "\n------------------\n",
1853 	    "\n------------------\n");
1854 #endif
1855 
1856     /* Step 2: Recode the model terms in binary form
1857      * ------  and at the same time, expand the model formula. */
1858 
1859     /* FIXME: this includes 'specials' in the model */
1860     /* There perhaps needs to be a an extra pass */
1861     /* through the model to delete any terms which */
1862     /* contain specials.  Actually, specials should */
1863     /* only enter additively so this should also be */
1864     /* checked and abort forced if not. */
1865 
1866     /* BDR 2002-01-29: S does include specials, so code may rely on this */
1867 
1868     /* FIXME: this is also the point where nesting needs to be taken care of. */
1869 
1870     SEXP formula = PROTECT(EncodeVars(CAR(args)));
1871     //                     ^^^^^^^^^^
1872     if(length(varlist) != nvar + 1) {
1873 	warning(_("'varlist' has changed (from nvar=%d) to new %d after EncodeVars() -- should no longer happen!"),
1874 		nvar, length(varlist) - 1);
1875 	nvar = length(varlist) - 1;
1876     }
1877 #ifdef DEBUG_terms
1878     Rprintf("after EncodeVars(): final (nvar,nwords) = (%d,%d); formula 'L' =\n", nvar, nwords);
1879     // Show the content of formula  [[ Do we have integer(nwords) or just int.(1) ? ]]
1880     printPList(formula);
1881 #endif
1882 
1883     /* Step 2a: Compute variable names */
1884 
1885     SEXP v, call, varnames = PROTECT(allocVector(STRSXP, nvar));
1886     {
1887 	R_xlen_t i;
1888 	for (v = CDR(varlist), i = 0; v != R_NilValue; v = CDR(v))
1889 	    SET_STRING_ELT(varnames, i++,
1890 			   STRING_ELT(deparse1line(CAR(v), 0), 0));
1891     }
1892 #ifdef DEBUG_terms
1893     Rprintf("after Step 2a: variable names:\n"); printVector(varnames, 1, 1);
1894 #endif
1895 
1896     /* Step 2b: Find and remove any offset(s) */
1897 
1898     /* first see if any of the variables are offsets */
1899     R_xlen_t k = 0;
1900     for (R_xlen_t l = response; l < nvar; l++)
1901 	if (!strncmp(CHAR(STRING_ELT(varnames, l)), "offset(", 7)) k++;
1902     if (k > 0) {
1903 #ifdef DEBUG_terms
1904 	Rprintf(" step 2b: found k=%ld offset(.)s\n", k);
1905 #endif
1906 	Rboolean foundOne = FALSE; /* has there been a non-offset term? */
1907 	/* allocate the "offsets" attribute */
1908 	SETCAR(a, v = allocVector(INTSXP, k));
1909 	for (int l = response, k = 0; l < nvar; l++)
1910 	    if (!strncmp(CHAR(STRING_ELT(varnames, l)), "offset(", 7))
1911 		INTEGER(v)[k++] = l + 1;
1912 	SET_TAG(a, install("offset"));
1913 	a = CDR(a);
1914 	/* now remove offset terms from the formula */
1915 	call = formula; /* call is to be the previous term once one is found */
1916 	while (1) {
1917 	    SEXP thisterm = foundOne ? CDR(call) : call;
1918 	    Rboolean have_offset = FALSE;
1919 #ifdef DEBUG_terms
1920 	    Rprintf(" while (1) : foundOne = %d; length(thisterm) =%d; ",
1921 		   foundOne, length(thisterm));
1922 #endif
1923 	    if(length(thisterm) == 0) break;
1924 	    for (int i = 1; i <= nvar; i++)
1925 		if (GetBit(CAR(thisterm), i) &&
1926 		    !strncmp(CHAR(STRING_ELT(varnames, i-1)), "offset(", 7)) {
1927 		    have_offset = TRUE;
1928 #ifdef DEBUG_terms
1929 		    Rprintf(" i=%d: have_offset, ", i);
1930 #endif
1931 		    break;
1932 		}
1933 	    if (have_offset) {
1934 		if (!foundOne) call = formula = CDR(formula);
1935 		else SETCDR(call, CDR(thisterm));
1936 	    } else {
1937 		if (foundOne) call = CDR(call);
1938 		else foundOne = TRUE;
1939 	    }
1940 	}
1941     }
1942     int nterm = length(formula); /* = #{model terms} */
1943 #ifdef DEBUG_terms
1944     Rprintf("after step 2: k=%ld, nterm:= |formula| = %d\n", k, nterm);
1945     // fails: 'formula' is "pairlist", not "vector" !!
1946     // Rprintf("formula[1..nterm=%d] = ", nterm); printVector(formula, 1, 0);
1947 
1948     /* Step 3: Reorder the model terms by BitCount, otherwise
1949      * ------  preserving their order. */
1950 
1951     Rprintf(" .. step 3 .. reorder model terms by BitCount():\n");
1952 #endif
1953     SEXP    ord = PROTECT(allocVector(INTSXP, nterm)),
1954 	pattern = PROTECT(allocVector(VECSXP, nterm));
1955     {
1956 	SEXP sCounts = PROTECT(allocVector(INTSXP, nterm));
1957 	int bitmax = 0,
1958 	    *iord = INTEGER(ord),
1959 	    *counts = INTEGER(sCounts);
1960 	R_xlen_t n;
1961 	for (call = formula, n = 0; call != R_NilValue; call = CDR(call), n++) {
1962 	    SET_VECTOR_ELT(pattern, n, CAR(call));
1963 	    counts[n] = BitCount(CAR(call), nvar);
1964 #ifdef DEBUG_terms
1965 	    int lc = length(CAR(call));
1966 	    Rprintf("  -> BitCount(CAR(call),nv) =:counts[n=%ld]=%2d, from bitpat.in int[%s%d]): ",
1967 		    n+1, counts[n], (lc==1)?"":"1:", lc);
1968 	    printVector(CAR(call), 0, 0);
1969 #endif
1970 	}
1971 	for (n = 0; n < nterm; n++)
1972 	    if(counts[n] > bitmax) bitmax = counts[n];
1973 #ifdef DEBUG_terms
1974 	Rprintf("step 3 (part I): counts[1..nterm]: "); printVector(sCounts, 1, 0);
1975 	Rprintf("     bitmax = max(counts[]) = %d\n", bitmax);
1976 #endif
1977 
1978 	if(keepOrder) {
1979 	    for (n = 0; n < nterm; n++)
1980 		iord[n] = counts[n];
1981 	} else {
1982 	    call = formula;
1983 	    int m = 0;
1984 	    for (int i = 0; i <= bitmax; i++) /* can order 0 occur? */
1985 		for (n = 0; n < nterm; n++)
1986 		    if (counts[n] == i) {
1987 			SETCAR(call, VECTOR_ELT(pattern, n));
1988 			call = CDR(call);
1989 			iord[m++] = i;
1990 		    }
1991 	}
1992 	UNPROTECT(2);
1993     }
1994 #ifdef DEBUG_terms
1995     Rprintf("after step 3: ord[1:nterm]: "); printVector(ord, 1, 0);
1996     Rprintf("--=--\n .. step 4: Create \"factors\" pattern matrix:\n");
1997 #endif
1998 
1999     /* Step 4: Compute the factor pattern for the model. */
2000     /* 0 - the variable does not appear in this term. */
2001     /* 1 - code the variable by contrasts in this term. */
2002     /* 2 - code the variable by indicators in this term. */
2003 
2004     if (nterm > 0) {
2005 	SETCAR(a, pattern = allocMatrix(INTSXP, nvar, nterm));
2006 	int *pattn = INTEGER(pattern);
2007 	for (R_xlen_t i = 0; i < ((R_xlen_t) nterm) * nvar; i++)
2008 	    pattn[i] = 0;
2009 	SEXP term = PROTECT(AllocTerm());
2010 	R_xlen_t n_n = -1; // n = 0;  ==>  n_n = -1 + n*nvar = -1
2011 	for (call = formula; call != R_NilValue; call = CDR(call)) {
2012 #ifdef DEBUG_terms
2013 	    Rprintf("  st.4: (bitpattern in int) term: "); printVector(CAR(call), 0,0);
2014 #endif
2015 	    for (int i = 1; i <= nvar; i++) {
2016 		if (GetBit(CAR(call), i)) {
2017 #ifdef DEBUG_terms
2018 		    Rprintf(" var i=%d --> TermCode(., c., i, t.) {bit[i] := 0}\n", i);
2019 #endif
2020 		    pattn[i+n_n] = TermCode(formula, call, i, term);
2021 #ifdef DEBUG_terms
2022 		    Rprintf(" => pattn[%d,%d] = %d\n", (n_n+1)/nvar, i, pattn[i+n_n]);
2023 #endif
2024 		}
2025 	    }
2026 	    n_n += nvar; // n++ ==>  n_n = -1 + n*nvar
2027 	}
2028 	UNPROTECT(1);
2029     }
2030     else {
2031 	SETCAR(a, pattern = allocVector(INTSXP,0));
2032     }
2033     SET_TAG(a, install("factors"));
2034     a = CDR(a);
2035 
2036 #ifdef DEBUG_terms
2037     Rprintf(".. after step 4: filled \"factors\" matrix (nvar x nterm) = (%d x %d)\n",
2038 	    nvar, nterm);
2039     Rprintf("--=--\n .. step 5 .. computing term labels \"term.labels\":\n");
2040 #endif
2041 
2042     /* Step 5: Compute term labels */
2043 
2044     SEXP termlabs = PROTECT(allocVector(STRSXP, nterm));
2045     R_xlen_t n = 0;
2046     for (call = formula; call != R_NilValue; call = CDR(call)) {
2047 	R_xlen_t l = 0;
2048 #ifdef DEBUG_terms
2049 	Rprintf("  st.5: (bitpattern in int) term: "); printVector(CAR(call), 0, 0);
2050 	Rprintf("  ----  {not tracing GetBit() when determing 'cbuf' length}\n");
2051 	trace_GetBit = FALSE; // *not* tracing below
2052 #endif
2053 	for (int i = 1; i <= nvar; i++) {
2054 	    if (GetBit(CAR(call), i)) {
2055 		if (l > 0)
2056 		    l += 1;
2057 		l += (int) strlen(CHAR(STRING_ELT(varnames, i - 1)));
2058 	    }
2059 	}
2060 #ifdef DEBUG_terms
2061 	trace_GetBit = TRUE; // back to tracing
2062 	Rprintf("     --> cbuf length %d (+1 for final \\0)\n", l);
2063 #endif
2064 	char cbuf[l+1];
2065 	cbuf[0] = '\0';
2066 	l = 0;
2067 	for (int i = 1; i <= nvar; i++) {
2068 	    if (GetBit(CAR(call), i)) {
2069 		if (l > 0)
2070 		    strcat(cbuf, ":");
2071 		strcat(cbuf, CHAR(STRING_ELT(varnames, i - 1)));
2072 		l++;
2073 	    }
2074 	}
2075 	SET_STRING_ELT(termlabs, n, mkChar(cbuf));
2076 	n++;
2077 #ifdef DEBUG_terms
2078 	Rprintf("  -> term.labels[%ld]: '%s'\n", n, cbuf);
2079 #endif
2080     }
2081 
2082 #ifdef DEBUG_terms
2083     Rprintf(".. finished step 5: term.labels: "); printVector(termlabs, 1, /* quote */ 1);
2084 #endif
2085 
2086     if (nterm > 0) { // dimnames("factors") <- ...
2087 	PROTECT(v = allocVector(VECSXP, 2));
2088 	SET_VECTOR_ELT(v, 0, varnames);
2089 	SET_VECTOR_ELT(v, 1, termlabs);
2090 	setAttrib(pattern, R_DimNamesSymbol, v);
2091 	UNPROTECT(1);
2092     }
2093     SETCAR(a, termlabs);
2094     UNPROTECT(1); // termlabs
2095     SET_TAG(a, install("term.labels"));
2096     a = CDR(a);
2097 
2098     /* If there are specials stick them in here */
2099 
2100     if (specials != R_NilValue) {
2101 	R_xlen_t j;
2102 	const void *vmax = vmaxget();
2103 	int i = length(specials);
2104 	SEXP t;
2105 	PROTECT(v = allocList(i));
2106 	for (j = 0, t = v; j < i; j++, t = CDR(t)) {
2107 	    const char *ss = translateChar(STRING_ELT(specials, j));
2108 	    SET_TAG(t, install(ss));
2109 	    R_xlen_t n = (int) strlen(ss);
2110 	    SETCAR(t, allocVector(INTSXP, 0));
2111 	    R_xlen_t k = 0;
2112 	    for (R_xlen_t l = 0; l < nvar; l++) {
2113 		if (!strncmp(CHAR(STRING_ELT(varnames, l)), ss, n))
2114 		    if (CHAR(STRING_ELT(varnames, l))[n] == '(')
2115 			k++;
2116 	    }
2117 	    if (k > 0) {
2118 		SETCAR(t, allocVector(INTSXP, k));
2119 		k = 0;
2120 		for (int l = 0; l < nvar; l++) {
2121 		    if (!strncmp(CHAR(STRING_ELT(varnames, l)), ss, n))
2122 			if (CHAR(STRING_ELT(varnames, l))[n] == '('){
2123 			    INTEGER(CAR(t))[k++] = l+1;
2124 			}
2125 		}
2126 	    }
2127 	    else SETCAR(t, R_NilValue);
2128 	}
2129 	SETCAR(a, v);
2130 	SET_TAG(a, install("specials"));
2131 	a = CDR(a);
2132 	UNPROTECT(1);
2133 	vmaxset(vmax);
2134     }
2135 
2136 
2137     /* Step 6: Fix up the formula by substituting for dot, which should be
2138        the framenames joined by + */
2139 
2140     if (haveDot) {
2141 	if(length(framenames)) {
2142 	    SEXP rhs;
2143 	    PROTECT_INDEX ind;
2144 	    PROTECT_WITH_INDEX(rhs = installTrChar(STRING_ELT(framenames, 0)),
2145 			       &ind);
2146 	    for (R_xlen_t i = 1; i < LENGTH(framenames); i++) {
2147 		REPROTECT(rhs = lang3(plusSymbol, rhs,
2148 				      installTrChar(STRING_ELT(framenames, i))),
2149 			  ind);
2150 	    }
2151 	    if (!isNull(CADDR(ans)))
2152 		SETCADDR(ans, ExpandDots(CADDR(ans), rhs));
2153 	    else
2154 		SETCADR(ans, ExpandDots(CADR(ans), rhs));
2155 	    UNPROTECT(1);
2156 	} else if(!allowDot && !hadFrameNames) {
2157 	    error(_("'.' in formula and no 'data' argument"));
2158 	}
2159     }
2160 
2161     SETCAR(a, allocVector(INTSXP, nterm));
2162     {
2163 	R_xlen_t n = 0;
2164 	int *ia = INTEGER(CAR(a)), *iord = INTEGER(ord);
2165 	for (call = formula; call != R_NilValue; call = CDR(call), n++)
2166 	    ia[n] = iord[n];
2167     }
2168 
2169     SET_TAG(a, install("order"));
2170     a = CDR(a);
2171 
2172     SETCAR(a, ScalarInteger(intercept != 0));
2173     SET_TAG(a, install("intercept"));
2174     a = CDR(a);
2175 
2176     SETCAR(a, ScalarInteger(response != 0));
2177     SET_TAG(a, install("response"));
2178     a = CDR(a);
2179 
2180     SETCAR(a, mkString("terms"));
2181     SET_TAG(a, R_ClassSymbol);
2182     SET_OBJECT(ans, 1);
2183 
2184     SETCDR(a, R_NilValue);  /* truncate if necessary */
2185 
2186     UNPROTECT(5);
2187     return ans;
2188 }
2189 // termsform()
2190