1 /*
2 * R : A Computer Language for Statistical Data Analysis
3 * Copyright (C) 1997--2020 The R Core Team
4 * Copyright (C) 1995, 1996 Robert Gentleman and Ross Ihaka
5 *
6 * This program is free software; you can redistribute it and/or modify
7 * it under the terms of the GNU General Public License as published by
8 * the Free Software Foundation; either version 2 of the License, or
9 * (at your option) any later version.
10 *
11 * This program is distributed in the hope that it will be useful,
12 * but WITHOUT ANY WARRANTY; without even the implied warranty of
13 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
14 * GNU General Public License for more details.
15 *
16 * You should have received a copy of the GNU General Public License
17 * along with this program; if not, a copy is available at
18 * https://www.R-project.org/Licenses/
19 */
20
21 #ifdef HAVE_CONFIG_H
22 #include <config.h>
23 #endif
24
25 #include <Defn.h>
26
27 #include "statsR.h"
28 #undef _
29 #ifdef ENABLE_NLS
30 #include <libintl.h>
31 #define _(String) dgettext ("stats", String)
32 #else
33 #define _(String) (String)
34 #endif
35
36 /* inline-able versions, used just once! */
isUnordered_int(SEXP s)37 static R_INLINE Rboolean isUnordered_int(SEXP s)
38 {
39 return (TYPEOF(s) == INTSXP
40 && inherits(s, "factor")
41 && !inherits(s, "ordered"));
42 }
43
isOrdered_int(SEXP s)44 static R_INLINE Rboolean isOrdered_int(SEXP s)
45 {
46 return (TYPEOF(s) == INTSXP
47 && inherits(s, "factor")
48 && inherits(s, "ordered"));
49 }
50
51 /*
52 * model.frame
53 *
54 * The argument "terms" contains the terms object generated from the
55 * model formula. We first evaluate the "variables" attribute of
56 * "terms" in the "data" environment. This gives us a list of basic
57 * variables to be in the model frame. We do some basic sanity
58 * checks on these to ensure that resulting object make sense.
59 *
60 * The argument "dots" gives additional things like "weights", "offsets"
61 * and "subset" which will also go into the model frame so that they can
62 * be treated in parallel.
63 *
64 * Next we subset the data frame according to "subset" and finally apply
65 * "na.action" to get the final data frame.
66 *
67 * Note that the "terms" argument is glued to the model frame as an
68 * attribute. Code downstream appears to need this.
69 *
70 */
71
72 /* model.frame(terms, rownames, variables, varnames, */
73 /* dots, dotnames, subset, na.action) */
74
modelframe(SEXP call,SEXP op,SEXP args,SEXP rho)75 SEXP modelframe(SEXP call, SEXP op, SEXP args, SEXP rho)
76 {
77 SEXP terms, data, names, variables, varnames, dots, dotnames, na_action;
78 SEXP ans, row_names, subset, tmp;
79 char buf[256];
80 int i, j, nr, nc;
81 int nvars, ndots, nactualdots;
82 const void *vmax = vmaxget();
83
84 args = CDR(args);
85 terms = CAR(args); args = CDR(args);
86 row_names = CAR(args); args = CDR(args);
87 variables = CAR(args); args = CDR(args);
88 varnames = CAR(args); args = CDR(args);
89 dots = CAR(args); args = CDR(args);
90 dotnames = CAR(args); args = CDR(args);
91 subset = CAR(args); args = CDR(args);
92 na_action = CAR(args);
93
94 /* Argument Sanity Checks */
95
96 if (!isNewList(variables))
97 error(_("invalid variables"));
98 if (!isString(varnames))
99 error(_("invalid variable names"));
100 if ((nvars = length(variables)) != length(varnames))
101 error(_("number of variables != number of variable names"));
102
103 if (!isNewList(dots))
104 error(_("invalid extra variables"));
105 if ((ndots = length(dots)) != length(dotnames))
106 error(_("number of variables != number of variable names"));
107 if ( ndots && !isString(dotnames))
108 error(_("invalid extra variable names"));
109
110 /* check for NULL extra arguments -- moved from interpreted code */
111
112 nactualdots = 0;
113 for (i = 0; i < ndots; i++)
114 if (VECTOR_ELT(dots, i) != R_NilValue) nactualdots++;
115
116 /* Assemble the base data frame. */
117
118 PROTECT(data = allocVector(VECSXP, nvars + nactualdots));
119 PROTECT(names = allocVector(STRSXP, nvars + nactualdots));
120
121 for (i = 0; i < nvars; i++) {
122 SET_VECTOR_ELT(data, i, VECTOR_ELT(variables, i));
123 SET_STRING_ELT(names, i, STRING_ELT(varnames, i));
124 }
125 for (i = 0,j = 0; i < ndots; i++) {
126 const char *ss;
127 if (VECTOR_ELT(dots, i) == R_NilValue) continue;
128 ss = translateChar(STRING_ELT(dotnames, i));
129 if(strlen(ss) + 3 > 256) error(_("overlong names in '%s'"), ss);
130 snprintf(buf, 256, "(%s)", ss);
131 SET_VECTOR_ELT(data, nvars + j, VECTOR_ELT(dots, i));
132 SET_STRING_ELT(names, nvars + j, mkChar(buf));
133 j++;
134 }
135 setAttrib(data, R_NamesSymbol, names);
136 UNPROTECT(2);
137
138 /* Sanity checks to ensure that the the answer can become */
139 /* a data frame. Be deeply suspicious here! */
140
141 nc = length(data);
142 nr = 0; /* -Wall */
143 if (nc > 0) {
144 nr = nrows(VECTOR_ELT(data, 0));
145 for (i = 0; i < nc; i++) {
146 ans = VECTOR_ELT(data, i);
147 switch(TYPEOF(ans)) {
148 case LGLSXP:
149 case INTSXP:
150 case REALSXP:
151 case CPLXSXP:
152 case STRSXP:
153 case RAWSXP:
154 break;
155 default:
156 error(_("invalid type (%s) for variable '%s'"),
157 type2char(TYPEOF(ans)),
158 translateChar(STRING_ELT(names, i)));
159 }
160 if (nrows(ans) != nr)
161 error(_("variable lengths differ (found for '%s')"),
162 translateChar(STRING_ELT(names, i)));
163 }
164 } else nr = length(row_names);
165
166 PROTECT(data);
167 PROTECT(subset);
168
169 /* Turn the data "list" into a "data.frame" */
170 /* so that subsetting methods will work. */
171 /* To do this we must attach "class" and */
172 /* "row.names" attributes */
173
174 PROTECT(tmp = mkString("data.frame"));
175 setAttrib(data, R_ClassSymbol, tmp);
176 UNPROTECT(1);
177 if (length(row_names) == nr) {
178 setAttrib(data, R_RowNamesSymbol, row_names);
179 } else {
180 /*
181 PROTECT(row_names = allocVector(INTSXP, nr));
182 for (i = 0; i < nr; i++) INTEGER(row_names)[i] = i+1; */
183 PROTECT(row_names = allocVector(INTSXP, 2));
184 INTEGER(row_names)[0] = NA_INTEGER;
185 INTEGER(row_names)[1] = nr;
186 setAttrib(data, R_RowNamesSymbol, row_names);
187 UNPROTECT(1);
188 }
189
190 /* Do the subsetting, if required. */
191 /* Need to save and restore 'most' attributes */
192
193 if (subset != R_NilValue) {
194 PROTECT(tmp=install("[.data.frame"));
195 PROTECT(tmp=LCONS(tmp,list4(data,subset,R_MissingArg,mkFalse())));
196 data = eval(tmp, rho);
197 UNPROTECT(2);
198 }
199 UNPROTECT(2);
200 PROTECT(data);
201
202 /* finally, we run na.action on the data frame */
203 /* usually, this will be na.omit */
204
205 if (na_action != R_NilValue) {
206 /* some na.actions need this to distinguish responses from
207 explanatory variables */
208 setAttrib(data, install("terms"), terms);
209 if (isString(na_action) && length(na_action) > 0)
210 na_action = installTrChar(STRING_ELT(na_action, 0));
211 PROTECT(na_action);
212 PROTECT(tmp = lang2(na_action, data));
213 ans = eval(tmp, rho);
214 if (MAYBE_REFERENCED(ans))
215 ans = shallow_duplicate(ans);
216 PROTECT(ans);
217 if (!isNewList(ans) || length(ans) != length(data))
218 error(_("invalid result from na.action"));
219 /* need to transfer _all but tsp and dim_ attributes, possibly lost
220 by subsetting in na.action. */
221 /* But if data is unchanged, don't mess with it (PR#16436) */
222
223 for ( i = length(ans) ; i-- ; )
224 if (VECTOR_ELT(data, i) != VECTOR_ELT(ans, i)) {
225 if (MAYBE_REFERENCED(VECTOR_ELT(ans, i)))
226 SET_VECTOR_ELT(ans, i,
227 shallow_duplicate(VECTOR_ELT(ans, i)));
228 copyMostAttribNoTs(VECTOR_ELT(data, i),VECTOR_ELT(ans, i));
229 }
230
231 UNPROTECT(3);
232 }
233 else ans = data;
234 UNPROTECT(1);
235 PROTECT(ans);
236
237 /* Finally, tack on a terms attribute
238 Now done at R level.
239 setAttrib(ans, install("terms"), terms); */
240 UNPROTECT(1);
241 vmaxset(vmax);
242 return ans;
243 }
244
245
246 /* The code below is related to model expansion */
247 /* and is ultimately called by modelmatrix. */
248
firstfactor(double * x,int nrx,int ncx,double * c,int nrc,int ncc,int * v,int adj)249 static void firstfactor(double *x, int nrx, int ncx,
250 double *c, int nrc, int ncc, int *v, int adj)
251 {
252 double *cj, *xj;
253
254 for (int j = 0; j < ncc; j++) {
255 xj = &x[j * (R_xlen_t)nrx];
256 cj = &c[j * (R_xlen_t)nrc];
257 for (int i = 0; i < nrx; i++)
258 if(v[i] == NA_INTEGER) xj[i] = NA_REAL;
259 else xj[i] = cj[v[i]-1+adj];
260 }
261 }
262
addfactor(double * x,int nrx,int ncx,double * c,int nrc,int ncc,int * v,int adj)263 static void addfactor(double *x, int nrx, int ncx,
264 double *c, int nrc, int ncc, int *v, int adj)
265 {
266 double *ck, *xj, *yj;
267
268 for (int k = ncc - 1; k >= 0; k--) {
269 for (int j = 0; j < ncx; j++) {
270 xj = &x[j * (R_xlen_t)nrx];
271 yj = &x[(k * (R_xlen_t)ncx + j)*nrx];
272 ck = &c[k * (R_xlen_t)nrc];
273 for (int i = 0; i < nrx; i++)
274 if(v[i] == NA_INTEGER) yj[i] = NA_REAL;
275 else yj[i] = ck[v[i]-1+adj] * xj[i];
276 }
277 }
278 }
279
firstvar(double * x,int nrx,int ncx,double * c,int nrc,int ncc)280 static void firstvar(double *x, int nrx, int ncx, double *c, int nrc, int ncc)
281 {
282 double *cj, *xj;
283
284 for (int j = 0; j < ncc; j++) {
285 xj = &x[j * (R_xlen_t)nrx];
286 cj = &c[j * (R_xlen_t)nrc];
287 for (int i = 0; i < nrx; i++)
288 xj[i] = cj[i];
289 }
290 }
291
addvar(double * x,int nrx,int ncx,double * c,int nrc,int ncc)292 static void addvar(double *x, int nrx, int ncx, double *c, int nrc, int ncc)
293 {
294 double *ck, *xj, *yj;
295
296 for (int k = ncc - 1; k >= 0; k--) {
297 for (int j = 0; j < ncx; j++) {
298 xj = &x[j * (R_xlen_t)nrx];
299 yj = &x[(k * (R_xlen_t)ncx + j)*nrx];
300 ck = &c[k * (R_xlen_t)nrc];
301 for (int i = 0; i < nrx; i++)
302 yj[i] = ck[i] * xj[i];
303 }
304 }
305 }
306
307 #define BUFSIZE 4096
308
AppendString(char * buf,const char * str)309 static char *AppendString(char *buf, const char *str)
310 {
311 while (*str)
312 *buf++ = *str++;
313 *buf = '\0';
314 return buf;
315 }
316
AppendInteger(char * buf,int i)317 static char *AppendInteger(char *buf, int i)
318 {
319 sprintf(buf, "%d", i);
320 while(*buf) buf++;
321 return buf;
322 }
323
ColumnNames(SEXP x)324 static SEXP ColumnNames(SEXP x)
325 {
326 SEXP dn = getAttrib(x, R_DimNamesSymbol);
327 if (dn == R_NilValue || length(dn) < 2)
328 return R_NilValue;
329 else
330 return VECTOR_ELT(dn, 1);
331 }
332
333 // called from R as .Externals2(C_modelmatrix, t, data)
modelmatrix(SEXP call,SEXP op,SEXP args,SEXP rho)334 SEXP modelmatrix(SEXP call, SEXP op, SEXP args, SEXP rho)
335 {
336 SEXP expr, factors, terms, vars, vnames, assign;
337 SEXP xnames, tnames, rnames;
338 SEXP count, contrast, contr1, contr2, nlevs, ordered, columns, x;
339 SEXP variable, var_i;
340 int fik, first, i, j, k, kk, ll, n, nc, nterms, nVar;
341 int intrcept, jstart, jnext, risponse, indx, rhs_response;
342 double dk, dnc;
343 char buf[BUFSIZE]="\0";
344 char *bufp;
345 const char *addp;
346 R_xlen_t nn;
347
348 args = CDR(args);
349
350 /* Get the "terms" structure and extract */
351 /* the intercept and response attributes. */
352 terms = CAR(args); // = 't' in R's calling code
353
354 intrcept = asLogical(getAttrib(terms, install("intercept")));
355 if (intrcept == NA_INTEGER)
356 intrcept = 0;
357
358 risponse = asLogical(getAttrib(terms, install("response")));
359 if (risponse == NA_INTEGER)
360 risponse = 0;
361
362 /* Get the factor pattern matrix. We duplicate this because */
363 /* we may want to alter it if we are in the no-intercept case. */
364 /* Note: the values of "nVar" and "nterms" are the REAL number of */
365 /* variables in the model data frame and the number of model terms. */
366
367 nVar = nterms = 0; /* -Wall */
368 PROTECT(factors = duplicate(getAttrib(terms, install("factors"))));
369 if (length(factors) == 0) {
370 /* if (intrcept == 0)
371 error("invalid model (zero parameters).");*/
372 nVar = 1;
373 nterms = 0;
374 }
375 else if (isInteger(factors) && isMatrix(factors)) {
376 nVar = nrows(factors);
377 nterms = ncols(factors);
378 }
379 else error(_("invalid '%s' argument"), "terms");
380
381 /* Get the variable names from the factor matrix */
382
383 vnames = getAttrib(factors, R_DimNamesSymbol);
384 if (length(factors) > 0) {
385 if (length(vnames) < 1 ||
386 (nVar - intrcept > 0 && !isString(VECTOR_ELT(vnames, 0))))
387 error(_("invalid '%s' argument"), "terms");
388 vnames = VECTOR_ELT(vnames, 0);
389 }
390
391 /* Get the variables from the model frame. First perform */
392 /* elementary sanity checks. Notes: 1) We need at least */
393 /* one variable (lhs or rhs) to compute the number of cases. */
394 /* 2) We don't type-check the response. */
395
396 vars = CADR(args);
397 if (!isNewList(vars) || length(vars) < nVar)
398 error(_("invalid model frame"));
399 if (length(vars) == 0)
400 error(_("do not know how many cases"));
401
402 nn = n = nrows(VECTOR_ELT(vars, 0));
403 /* This could be generated, so need to protect it */
404 PROTECT(rnames = getAttrib(vars, R_RowNamesSymbol));
405
406 /* This section of the code checks the types of the variables
407 in the model frame. Note that it should really only check
408 the variables if they appear in a term in the model.
409 Because it does not, we need to allow other types here, as they
410 might well occur on the LHS.
411 The R code converts all character variables in the model frame to
412 factors, so the only types that ought to be here are logical,
413 integer (including factor), numeric and complex.
414 */
415
416 PROTECT(variable = allocVector(VECSXP, nVar));
417 PROTECT(nlevs = allocVector(INTSXP, nVar));
418 PROTECT(ordered = allocVector(LGLSXP, nVar));
419 PROTECT(columns = allocVector(INTSXP, nVar));
420
421 for (i = 0; i < nVar; i++) {
422 var_i = SET_VECTOR_ELT(variable, i, VECTOR_ELT(vars, i));
423 if (nrows(var_i) != n)
424 error(_("variable lengths differ (found for variable %d)"), i);
425 if (isOrdered_int(var_i)) {
426 LOGICAL(ordered)[i] = 1;
427 if((INTEGER(nlevs)[i] = nlevels(var_i)) < 1)
428 error(_("variable %d has no levels"), i+1);
429 /* will get updated later when contrasts are set */
430 INTEGER(columns)[i] = ncols(var_i);
431 }
432 else if (isUnordered_int(var_i)) {
433 LOGICAL(ordered)[i] = 0;
434 if((INTEGER(nlevs)[i] = nlevels(var_i)) < 1)
435 error(_("variable %d has no levels"), i+1);
436 /* will get updated later when contrasts are set */
437 INTEGER(columns)[i] = ncols(var_i);
438 }
439 else if (isLogical(var_i)) {
440 LOGICAL(ordered)[i] = 0;
441 INTEGER(nlevs)[i] = 2;
442 INTEGER(columns)[i] = ncols(var_i);
443 }
444 else if (isNumeric(var_i)) {
445 SET_VECTOR_ELT(variable, i, coerceVector(var_i, REALSXP));
446 var_i = VECTOR_ELT(variable, i);
447 LOGICAL(ordered)[i] = 0;
448 INTEGER(nlevs)[i] = 0;
449 INTEGER(columns)[i] = ncols(var_i);
450 }
451 else {
452 LOGICAL(ordered)[i] = 0;
453 INTEGER(nlevs)[i] = 0;
454 INTEGER(columns)[i] = ncols(var_i);
455 }
456 /* else
457 error(_("invalid variable type for '%s'"),
458 CHAR(STRING_ELT(vnames, i))); */
459 }
460
461 /* If there is no intercept we look through the factor pattern */
462 /* matrix and adjust the code for the first factor found so that */
463 /* it will be coded by dummy variables rather than contrasts. */
464
465 if (!intrcept) {
466 for (j = 0; j < nterms; j++) {
467 for (i = risponse; i < nVar; i++) {
468 if (INTEGER(nlevs)[i] > 1
469 && INTEGER(factors)[i + j * nVar] > 0) {
470 INTEGER(factors)[i + j * nVar] = 2;
471 goto alldone;
472 }
473 }
474 }
475 }
476 alldone:
477 ;
478
479 /* Compute the required contrast or dummy variable matrices. */
480 /* We set up a symbolic expression to evaluate these, substituting */
481 /* the required arguments at call time. The calls have the following */
482 /* form: (contrast.type nlevs contrasts) */
483
484 PROTECT(contr1 = allocVector(VECSXP, nVar));
485 PROTECT(contr2 = allocVector(VECSXP, nVar));
486
487 PROTECT(expr = allocList(3));
488 SET_TYPEOF(expr, LANGSXP);
489 SETCAR(expr, install("contrasts"));
490 SETCADDR(expr, allocVector(LGLSXP, 1));
491
492 /* FIXME: We need to allow a third argument to this function */
493 /* which allows us to specify contrasts directly. That argument */
494 /* would be used here in exactly the same way as the below. */
495 /* I.e. we would search the list of constrast specs before */
496 /* we try the evaluation below. */
497
498 for (i = 0; i < nVar; i++) {
499 if (INTEGER(nlevs)[i]) {
500 k = 0;
501 for (j = 0; j < nterms; j++) {
502 if (INTEGER(factors)[i + j * nVar] == 1)
503 k |= 1;
504 else if (INTEGER(factors)[i + j * nVar] == 2)
505 k |= 2;
506 }
507 SETCADR(expr, VECTOR_ELT(variable, i));
508 if (k & 1) {
509 LOGICAL(CADDR(expr))[0] = 1;
510 SET_VECTOR_ELT(contr1, i, eval(expr, rho));
511 }
512 if (k & 2) {
513 LOGICAL(CADDR(expr))[0] = 0;
514 SET_VECTOR_ELT(contr2, i, eval(expr, rho));
515 }
516 }
517 }
518
519 /* By convention, an rhs term identical to the response generates nothing
520 in the model matrix (but interactions involving the response do). */
521
522 rhs_response = -1;
523 if (risponse > 0) /* there is a response specified */
524 for (j = 0; j < nterms; j++)
525 if (INTEGER(factors)[risponse - 1 + j * nVar]) {
526 for (i = 0, k = 0; i < nVar; i++)
527 k += INTEGER(factors)[i + j * nVar] > 0;
528 if (k == 1) {
529 rhs_response = j;
530 break;
531 }
532 }
533
534
535 /* We now have everything needed to build the design matrix. */
536 /* The first step is to compute the matrix size and to allocate it. */
537 /* Note that "count" holds a count of how many columns there are */
538 /* for each term in the model and "nc" gives the total column count. */
539
540 PROTECT(count = allocVector(INTSXP, nterms));
541 if (intrcept)
542 dnc = 1;
543 else
544 dnc = 0;
545 for (j = 0; j < nterms; j++) {
546 if (j == rhs_response) {
547 warning(_("the response appeared on the right-hand side and was dropped"));
548 INTEGER(count)[j] = 0; /* need this initialised */
549 continue;
550 }
551 dk = 1; /* accumulate in a double to detect overflow */
552 for (i = 0; i < nVar; i++) {
553 if (INTEGER(factors)[i + j * nVar]) {
554 if (INTEGER(nlevs)[i]) {
555 switch(INTEGER(factors)[i + j * nVar]) {
556 case 1:
557 dk *= ncols(VECTOR_ELT(contr1, i));
558 break;
559 case 2:
560 dk *= ncols(VECTOR_ELT(contr2, i));
561 break;
562 }
563 }
564 else dk *= INTEGER(columns)[i];
565 }
566 }
567 if (dk > INT_MAX) error(_("term %d would require %.0g columns"), j+1, dk);
568 INTEGER(count)[j] = (int) dk;
569 dnc = dnc + dk;
570 }
571 if (dnc > INT_MAX) error(_("matrix would require %.0g columns"), dnc);
572 nc = (int) dnc;
573
574 /* Record which columns of the design matrix are associated */
575 /* with which model terms. */
576
577 PROTECT(assign = allocVector(INTSXP, nc));
578 k = 0;
579 if (intrcept) INTEGER(assign)[k++] = 0;
580 for (j = 0; j < nterms; j++) {
581 if(INTEGER(count)[j] <= 0)
582 warning(_("problem with term %d in model.matrix: no columns are assigned"),
583 j+1);
584 for (i = 0; i < INTEGER(count)[j]; i++)
585 INTEGER(assign)[k++] = j+1;
586 }
587
588
589 /* Create column labels for the matrix columns. */
590
591 PROTECT(xnames = allocVector(STRSXP, nc));
592
593
594 /* Here we loop over the terms in the model and, within each */
595 /* term, loop over the corresponding columns of the design */
596 /* matrix, assembling the names. */
597
598 /* FIXME : The body within these two loops should be embedded */
599 /* in its own function. */
600
601 k = 0;
602 if (intrcept)
603 SET_STRING_ELT(xnames, k++, mkChar("(Intercept)"));
604
605 for (j = 0; j < nterms; j++) {
606 if (j == rhs_response) continue;
607 for (kk = 0; kk < INTEGER(count)[j]; kk++) {
608 first = 1;
609 indx = kk;
610 bufp = &buf[0];
611 for (i = 0; i < nVar; i++) {
612 ll = INTEGER(factors)[i + j * nVar];
613 if (ll) {
614 var_i = VECTOR_ELT(variable, i);
615 if (!first)
616 bufp = AppendString(bufp, ":");
617 first = 0;
618 if (isFactor(var_i) || isLogical(var_i)) {
619 if (ll == 1) {
620 x = ColumnNames(VECTOR_ELT(contr1, i));
621 ll = ncols(VECTOR_ELT(contr1, i));
622 }
623 else {
624 x = ColumnNames(VECTOR_ELT(contr2, i));
625 ll = ncols(VECTOR_ELT(contr2, i));
626 }
627 addp = translateChar(STRING_ELT(vnames, i));
628 if(strlen(buf) + strlen(addp) < BUFSIZE)
629 bufp = AppendString(bufp, addp);
630 else
631 warning(_("term names will be truncated"));
632 if (x == R_NilValue) {
633 if(strlen(buf) + 10 < BUFSIZE)
634 bufp = AppendInteger(bufp, indx % ll + 1);
635 else
636 warning(_("term names will be truncated"));
637 } else {
638 addp = translateChar(STRING_ELT(x, indx % ll));
639 if(strlen(buf) + strlen(addp) < BUFSIZE)
640 bufp = AppendString(bufp, addp);
641 else
642 warning(_("term names will be truncated"));
643 }
644 } else if (isComplex(var_i)) {
645 error(_("complex variables are not currently allowed in model matrices"));
646 } else if(isNumeric(var_i)) { /* numeric */
647 x = ColumnNames(var_i);
648 ll = ncols(var_i);
649 addp = translateChar(STRING_ELT(vnames, i));
650 if(strlen(buf) + strlen(addp) < BUFSIZE)
651 bufp = AppendString(bufp, addp);
652 else
653 warning(_("term names will be truncated"));
654 if (ll > 1) {
655 if (x == R_NilValue) {
656 if(strlen(buf) + 10 < BUFSIZE)
657 bufp = AppendInteger(bufp, indx % ll + 1);
658 else
659 warning(_("term names will be truncated"));
660 } else {
661 addp = translateChar(STRING_ELT(x, indx % ll));
662 if(strlen(buf) + strlen(addp) < BUFSIZE)
663 bufp = AppendString(bufp, addp);
664 else
665 warning(_("term names will be truncated"));
666 }
667 }
668 } else
669 error(_("variables of type '%s' are not allowed in model matrices"),
670 type2char(TYPEOF(var_i)));
671 indx /= ll;
672 }
673 }
674 SET_STRING_ELT(xnames, k++, mkChar(buf));
675 }
676 }
677
678 /* Allocate and compute the design matrix. */
679
680 PROTECT(x = allocMatrix(REALSXP, n, nc));
681 double *rx = REAL(x);
682
683 #ifdef R_MEMORY_PROFILING
684 if (RTRACE(vars)){
685 memtrace_report(vars, x);
686 SET_RTRACE(x, 1);
687 }
688 #endif
689
690 /* a) Begin with a column of 1s for the intercept. */
691
692 if ((jnext = jstart = intrcept) != 0)
693 for (i = 0; i < n; i++)
694 rx[i] = 1.0;
695
696 /* b) Now loop over the model terms */
697
698 contrast = R_NilValue; /* -Wall */
699 for (k = 0; k < nterms; k++) {
700 if (k == rhs_response) continue;
701 for (i = 0; i < nVar; i++) {
702 if (INTEGER(columns)[i] == 0)
703 continue;
704 var_i = VECTOR_ELT(variable, i);
705 #ifdef R_MEMORY_PROFILING
706 if (RTRACE(var_i)){
707 memtrace_report(var_i, x);
708 SET_RTRACE(x, 1);
709 }
710 #endif
711 fik = INTEGER(factors)[i + k * nVar];
712 if (fik) {
713 switch(fik) {
714 case 1:
715 contrast = coerceVector(VECTOR_ELT(contr1, i), REALSXP);
716 break;
717 case 2:
718 contrast = coerceVector(VECTOR_ELT(contr2, i), REALSXP);
719 break;
720 }
721 PROTECT(contrast);
722 if (jnext == jstart) {
723 if (INTEGER(nlevs)[i] > 0) {
724 int adj = isLogical(var_i)?1:0;
725 // avoid overflow of jstart * nn PR#15578
726 firstfactor(&rx[jstart * nn], n, jnext - jstart,
727 REAL(contrast), nrows(contrast),
728 ncols(contrast), INTEGER(var_i), adj);
729 jnext = jnext + ncols(contrast);
730 }
731 else {
732 firstvar(&rx[jstart * nn], n, jnext - jstart,
733 REAL(var_i), n, ncols(var_i));
734 jnext = jnext + ncols(var_i);
735 }
736 }
737 else {
738 if (INTEGER(nlevs)[i] > 0) {
739 int adj = isLogical(var_i)?1:0;
740 addfactor(&rx[jstart * nn], n, jnext - jstart,
741 REAL(contrast), nrows(contrast),
742 ncols(contrast), INTEGER(var_i), adj);
743 jnext = jnext + (jnext - jstart)*(ncols(contrast) - 1);
744 }
745 else {
746 addvar(&rx[jstart * nn], n, jnext - jstart,
747 REAL(var_i), n, ncols(var_i));
748 jnext = jnext + (jnext - jstart) * (ncols(var_i) - 1);
749 }
750 }
751 UNPROTECT(1);
752 }
753 }
754 jstart = jnext;
755 }
756 PROTECT(tnames = allocVector(VECSXP, 2));
757 SET_VECTOR_ELT(tnames, 0, rnames);
758 SET_VECTOR_ELT(tnames, 1, xnames);
759 setAttrib(x, R_DimNamesSymbol, tnames);
760 setAttrib(x, install("assign"), assign);
761 UNPROTECT(14);
762 return x;
763 }
764 // modelmatrix()
765
766
767 /* updateform() : Update a model formula by the replacement of "." templates.
768 ---------------------------------------------------------------------------
769 */
770 static SEXP tildeSymbol = NULL;
771 static SEXP plusSymbol = NULL;
772 static SEXP minusSymbol = NULL;
773 static SEXP timesSymbol = NULL;
774 static SEXP slashSymbol = NULL;
775 static SEXP colonSymbol = NULL;
776 static SEXP powerSymbol = NULL;
777 static SEXP dotSymbol = NULL;
778 static SEXP parenSymbol = NULL;
779 static SEXP inSymbol = NULL;
780
ExpandDots(SEXP object,SEXP value)781 static SEXP ExpandDots(SEXP object, SEXP value)
782 {
783 SEXP op;
784
785 if (TYPEOF(object) == SYMSXP) {
786 if (object == dotSymbol)
787 object = duplicate(value);
788 return object;
789 }
790
791 if (TYPEOF(object) == LANGSXP) {
792 if (TYPEOF(value) == LANGSXP) op = CAR(value);
793 else op = NULL;
794 PROTECT(object);
795 if (CAR(object) == plusSymbol) {
796 if (length(object) == 2) {
797 SETCADR(object, ExpandDots(CADR(object), value));
798 }
799 else if (length(object) == 3) {
800 SETCADR(object, ExpandDots(CADR(object), value));
801 SETCADDR(object, ExpandDots(CADDR(object), value));
802 }
803 else goto badformula;
804 }
805 else if (CAR(object) == minusSymbol) {
806 if (length(object) == 2) {
807 if (CADR(object) == dotSymbol &&
808 (op == plusSymbol || op == minusSymbol))
809 SETCADR(object, lang2(parenSymbol,
810 ExpandDots(CADR(object), value)));
811 else
812 SETCADR(object, ExpandDots(CADR(object), value));
813 }
814 else if (length(object) == 3) {
815 if (CADR(object) == dotSymbol &&
816 (op == plusSymbol || op == minusSymbol))
817 SETCADR(object, lang2(parenSymbol,
818 ExpandDots(CADR(object), value)));
819 else
820 SETCADR(object, ExpandDots(CADR(object), value));
821 if (CADDR(object) == dotSymbol &&
822 (op == plusSymbol || op == minusSymbol))
823 SETCADDR(object, lang2(parenSymbol,
824 ExpandDots(CADDR(object), value)));
825 else
826 SETCADDR(object, ExpandDots(CADDR(object), value));
827 }
828 else goto badformula;
829 }
830 else if (CAR(object) == timesSymbol || CAR(object) == slashSymbol) {
831 if (length(object) != 3)
832 goto badformula;
833 if (CADR(object) == dotSymbol &&
834 (op == plusSymbol || op == minusSymbol))
835 SETCADR(object, lang2(parenSymbol,
836 ExpandDots(CADR(object), value)));
837 else
838 SETCADR(object, ExpandDots(CADR(object), value));
839 if (CADDR(object) == dotSymbol &&
840 (op == plusSymbol || op == minusSymbol))
841 SETCADDR(object, lang2(parenSymbol,
842 ExpandDots(CADDR(object), value)));
843 else
844 SETCADDR(object, ExpandDots(CADDR(object), value));
845 }
846 else if (CAR(object) == colonSymbol) {
847 if (length(object) != 3)
848 goto badformula;
849 if (CADR(object) == dotSymbol &&
850 (op == plusSymbol || op == minusSymbol ||
851 op == timesSymbol || op == slashSymbol))
852 SETCADR(object, lang2(parenSymbol,
853 ExpandDots(CADR(object), value)));
854 else
855 SETCADR(object, ExpandDots(CADR(object), value));
856 if (CADDR(object) == dotSymbol &&
857 (op == plusSymbol || op == minusSymbol))
858 SETCADDR(object, lang2(parenSymbol,
859 ExpandDots(CADDR(object), value)));
860 else
861 SETCADDR(object, ExpandDots(CADDR(object), value));
862 }
863 else if (CAR(object) == powerSymbol) {
864 if (length(object) != 3)
865 goto badformula;
866 if (CADR(object) == dotSymbol &&
867 (op == plusSymbol || op == minusSymbol ||
868 op == timesSymbol || op == slashSymbol ||
869 op == colonSymbol))
870 SETCADR(object, lang2(parenSymbol,
871 ExpandDots(CADR(object), value)));
872 else
873 SETCADR(object, ExpandDots(CADR(object), value));
874 if (CADDR(object) == dotSymbol &&
875 (op == plusSymbol || op == minusSymbol))
876 SETCADDR(object, lang2(parenSymbol,
877 ExpandDots(CADDR(object), value)));
878 else
879 SETCADDR(object, ExpandDots(CADDR(object), value));
880 }
881 else {
882 op = object;
883 while(op != R_NilValue) {
884 SETCAR(op, ExpandDots(CAR(op), value));
885 op = CDR(op);
886 }
887 }
888 UNPROTECT(1);
889 return object;
890 }
891 else return object;
892
893 badformula:
894 error(_("invalid formula in 'update'"));
895 return R_NilValue; /*NOTREACHED*/
896 }
897
updateform(SEXP old,SEXP new)898 SEXP updateform(SEXP old, SEXP new)
899 {
900 SEXP _new;
901
902 /* Always fetch these values rather than trying */
903 /* to remember them between calls. The overhead */
904 /* is minimal and we don't have to worry about */
905 /* intervening dump/restore problems. */
906
907 tildeSymbol = install("~");
908 plusSymbol = install("+");
909 minusSymbol = install("-");
910 timesSymbol = install("*");
911 slashSymbol = install("/");
912 colonSymbol = install(":");
913 powerSymbol = install("^");
914 dotSymbol = install(".");
915 parenSymbol = install("(");
916 inSymbol = install("%in%");
917
918 /* We must duplicate here because the */
919 /* formulae may be part of the parse tree */
920 /* and we don't want to modify it. */
921
922 PROTECT(_new = duplicate(new));
923
924 /* Check of new and old formulae. */
925 if (TYPEOF(old) != LANGSXP ||
926 (TYPEOF(_new) != LANGSXP && CAR(old) != tildeSymbol) ||
927 CAR(_new) != tildeSymbol)
928 error(_("formula expected"));
929
930 if (length(old) == 3) {
931 SEXP lhs = CADR(old);
932 SEXP rhs = CADDR(old);
933 /* We now check that new formula has a valid lhs.
934 If it doesn't, we add one and set it to the rhs of the old
935 formula. */
936 if (length(_new) == 2)
937 SETCDR(_new, CONS(lhs, CDR(_new)));
938 /* Now we check the left and right sides of the new formula
939 and substitute the correct value for any "." templates.
940 We must parenthesize the rhs or we might upset arity and
941 precedence. */
942 PROTECT(rhs);
943 SETCADR(_new, ExpandDots(CADR(_new), lhs));
944 SETCADDR(_new, ExpandDots(CADDR(_new), rhs));
945 UNPROTECT(1);
946 }
947 else {
948 /* The old formula had no lhs, so we only expand the rhs of the
949 new formula. */
950 SEXP rhs = CADR(old);
951 if (length(_new) == 3)
952 SETCADDR(_new, ExpandDots(CADDR(_new), rhs));
953 else
954 SETCADR(_new, ExpandDots(CADR(_new), rhs));
955 }
956
957 /* It might be overkill to zero the */
958 /* the attribute list of the returned */
959 /* value, but it can't hurt. */
960
961 SET_ATTRIB(_new, R_NilValue);
962 SET_OBJECT(_new, 0);
963 SEXP DotEnvSymbol = install(".Environment");
964 setAttrib(_new, DotEnvSymbol, getAttrib(old, DotEnvSymbol));
965
966 UNPROTECT(1);
967 return _new;
968 }
969
970
971 /*==========================================================================*/
972
973 /* termsform() & auxiliaries: Workhorse to turn model formula into terms() object
974 ------------------------------------------------------------------------------
975 */
976 #ifdef DEBUG_terms
977 # include <Print.h>
978 /* can use printVector(SEXP x, int indx, int quote)
979 * ~~~~~~~~~~~ ^^^^ ^^^^^
980 * to print R vector x[]; if(indx) print indices; if(quote) quote strings
981 * Print a "pair"List (with all *vector* components): */
printPList(SEXP form)982 static void printPList(SEXP form)
983 {
984 R_xlen_t n = 0;
985 SEXP cl;
986 for (cl = form, n = 0; cl != R_NilValue; cl = CDR(cl), n++) {
987 Rprintf(" L[%d] (|L[.]|=%d): ", n+1, length(CAR(cl)));
988 printVector(CAR(cl), 0, 0);
989 }
990 return;
991 }
992 static Rboolean trace_GetBit = TRUE;
993 static Rboolean trace_InstallVar = TRUE;
994 static int n_xVars; // nesting level: use for indentation
995 #endif
996
997 // word size in *bits* :
998 #define WORDSIZE (8*sizeof(int))
999
1000 // Global "State" Variables for terms() computation :
1001 // -------------------------------------------------
1002
1003 static int // 0/1 (Boolean) :
1004 intercept, // 1: have intercept term in the model
1005 parity, // +/- parity
1006 response; // 1: response term in the model
1007 static int nwords; /* # of words (ints) to code a term */
1008 static SEXP varlist; /* variables in the model */
1009 static PROTECT_INDEX vpi;
1010 static SEXP framenames; /* variables names for specified frame */
1011 static Rboolean haveDot; /* does RHS of formula contain `.'? */
1012
isZeroOne(SEXP x)1013 static int isZeroOne(SEXP x)
1014 {
1015 if (!isNumeric(x)) return 0;
1016 return (asReal(x) == 0.0 || asReal(x) == 1.0);
1017 }
1018
isZero(SEXP x)1019 static int isZero(SEXP x)
1020 {
1021 if (!isNumeric(x)) return 0;
1022 return asReal(x) == 0.0;
1023 }
1024
isOne(SEXP x)1025 static int isOne(SEXP x)
1026 {
1027 if (!isNumeric(x)) return 0;
1028 return asReal(x) == 1.0;
1029 }
1030
1031
1032 /* MatchVar determines whether two ``variables'' are */
1033 /* identical. Expressions are identical if they have */
1034 /* the same list structure and their atoms are identical. */
1035 /* This is just EQUAL from lisp. */
1036
1037 /* See src/main/memory.c: probably could be simplified to pointer comparison */
Seql2(SEXP a,SEXP b)1038 static int Seql2(SEXP a, SEXP b)
1039 {
1040 if (a == b) return 1;
1041 if (IS_CACHED(a) && IS_CACHED(b) && ENC_KNOWN(a) == ENC_KNOWN(b))
1042 return 0;
1043 else {
1044 const void *vmax = vmaxget();
1045 int result = !strcmp(translateCharUTF8(a), translateCharUTF8(b));
1046 vmaxset(vmax); /* discard any memory used by translateCharUTF8 */
1047 return result;
1048 }
1049 }
1050
MatchVar(SEXP var1,SEXP var2)1051 static int MatchVar(SEXP var1, SEXP var2)
1052 {
1053 /* For expedience, and sanity... */
1054 if ( var1 == var2 )
1055 return 1;
1056 /* Handle Nulls */
1057 if (isNull(var1) && isNull(var2))
1058 return 1;
1059 if (isNull(var1) || isNull(var2))
1060 return 0;
1061 /* Non-atomic objects - compare CARs & CDRs (and TAGs: PR#17235) */
1062 if ((isList(var1) || isLanguage(var1)) &&
1063 (isList(var2) || isLanguage(var2)))
1064 return MatchVar(CAR(var1), CAR(var2)) &&
1065 MatchVar(CDR(var1), CDR(var2)) &&
1066 MatchVar(TAG(var1), TAG(var2));
1067 /* Symbols */
1068 if (isSymbol(var1) && isSymbol(var2))
1069 return (var1 == var2);
1070 /* Literal Numerics */
1071 if (isNumeric(var1) && isNumeric(var2))
1072 return (asReal(var1) == asReal(var2));
1073 /* Literal Strings */
1074 if (isString(var1) && isString(var2))
1075 return Seql2(STRING_ELT(var1, 0), STRING_ELT(var2, 0));
1076 /* Nothing else matches */
1077 return 0;
1078 }
1079
1080
1081 /* InstallVar locates a ''variable'' in the model variable list
1082 adding it to the global varlist if not found. */
1083
1084 /* FIXME? This is used in two "almost orthogonal" situations:
1085 * 1) Install variable in varlist -- index is *not* used by caller:
1086 * InstallVar(.) called by ExtractVars() always w/o assigning result.
1087 * 2) Find the index of 'var' in 'varlist' (but do not change varlist,
1088 * as now *all* variables should have been assigned by ExtractVars());
1089 * called as 'whichBit = InstallVar(.)' *only* by AllocTermSetBit1()
1090 */
InstallVar(SEXP var)1091 static int InstallVar(SEXP var)
1092 {
1093 SEXP v;
1094 /* Check that variable is legitimate */
1095 if (!isSymbol(var) && !isLanguage(var) && !isZeroOne(var))
1096 error(_("invalid term in model formula"));
1097 /* Lookup/Install it */
1098 int indx = 0;
1099 for (v = varlist; CDR(v) != R_NilValue; v = CDR(v)) {
1100 indx++;
1101 if (MatchVar(var, CADR(v)))
1102 return indx;
1103 }
1104 #ifdef DEBUG_terms
1105 if(trace_InstallVar)
1106 Rprintf("InstallVar(%s): adding it to 'varlist' and return indx+1 = %d\n",
1107 CHAR(STRING_ELT(deparse1line(var, 0), 0)), indx + 1);
1108 #endif
1109 SETCDR(v, CONS(var, R_NilValue));
1110 return indx + 1;
1111 }
1112
1113
1114 /* If there is a dotsxp being expanded then we need to see
1115 whether any of the variables in the data frame match with
1116 the variable on the lhs. If so they shouldn't be included
1117 in the factors */
1118
CheckRHS(SEXP v)1119 static void CheckRHS(SEXP v)
1120 {
1121 while ((isList(v) || isLanguage(v)) && v != R_NilValue) {
1122 CheckRHS(CAR(v));
1123 v = CDR(v);
1124 }
1125 if (isSymbol(v)) {
1126 for (int i = 0; i < length(framenames); i++) {
1127 SEXP s = installTrChar(STRING_ELT(framenames, i));
1128 if (v == s) {
1129 SEXP t = allocVector(STRSXP, length(framenames) - 1);
1130 for (int j = 0; j < length(t); j++) {
1131 if (j < i)
1132 SET_STRING_ELT(t, j, STRING_ELT(framenames, j));
1133 else
1134 SET_STRING_ELT(t, j, STRING_ELT(framenames, j+1));
1135 }
1136 REPROTECT(framenames = t, vpi);
1137 }
1138 }
1139 }
1140 }
1141
1142
1143 /* ExtractVars() recursively extracts the variables in a model formula.
1144 * ------------- It calls InstallVar() to do the installation.
1145 * The code takes care of unary '+' and '-' . No checks are made
1146 * of the other ``binary'' operators. Maybe there should be some.
1147
1148 * NB: logic here is partly "repeated" in EncodeVars() --->> keep in sync !!
1149 */
ExtractVars(SEXP formula)1150 static void ExtractVars(SEXP formula)
1151 {
1152 #ifdef DEBUG_terms
1153 // for(int j=0; j < n_xVars; j++) Rprintf(" ");
1154 // indentation by (2 * n_xVars)
1155 Rprintf("%*d ExtractVars(%s): ", 2*n_xVars, n_xVars,
1156 CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1157 n_xVars++; // nesting level: use for indentation (by 2):
1158 int my_n_x = n_xVars; // so we can restore n_xVars before leaving
1159
1160 # define Prt_xtrVars(_S_) Rprintf(" case '%s'\n", _S_)
1161 # define Return n_xVars = my_n_x; return
1162 #else
1163 # define Prt_xtrVars(_S_)
1164 # define Return return
1165 #endif
1166
1167 if (isNull(formula) || isZeroOne(formula)) {
1168 Prt_xtrVars("Null or ZeroOne");
1169 Return;
1170 }
1171 if (isSymbol(formula)) {
1172 if (formula == dotSymbol) haveDot = TRUE;
1173 Prt_xtrVars(haveDot ? "isSym(<Dot>)" : "isSymbol()");
1174 if (haveDot && framenames != R_NilValue) {
1175 // install variables of the data frame:
1176 for (int i = 0; i < length(framenames); i++) {
1177 SEXP v = installTrChar(STRING_ELT(framenames, i));
1178 if (!MatchVar(v, CADR(varlist))) InstallVar(v);
1179 }
1180 } else
1181 InstallVar(formula);
1182 Return;
1183 }
1184 if (isLanguage(formula)) {
1185 if (CAR(formula) == tildeSymbol) {
1186 if (response)
1187 error(_("invalid model formula")); // more than one '~'
1188 if (isNull(CDDR(formula))) {
1189 Prt_xtrVars("isLanguage, tilde, *not* response");
1190 response = 0;
1191 ExtractVars(CADR(formula));
1192 }
1193 else {
1194 Prt_xtrVars("isLanguage, tilde, *response*");
1195 response = 1;
1196 InstallVar(CADR(formula));
1197 ExtractVars(CADDR(formula));
1198 }
1199 Return;
1200 }
1201 if (CAR(formula) == plusSymbol) { // '+'
1202 int len = length(formula);
1203 Prt_xtrVars("isLanguage, '+'");
1204 if (len > 1)
1205 ExtractVars(CADR(formula));
1206 if (len > 2) // binary
1207 ExtractVars(CADDR(formula));
1208 Return;
1209 }
1210 if (CAR(formula) == colonSymbol) {
1211 Prt_xtrVars("isLanguage, ':'");
1212 ExtractVars(CADR(formula));
1213 ExtractVars(CADDR(formula));
1214 Return;
1215 }
1216 if (CAR(formula) == powerSymbol) {
1217 Prt_xtrVars("isLanguage, '^'");
1218 if (!isNumeric(CADDR(formula)))
1219 error(_("invalid power in formula"));
1220 ExtractVars(CADR(formula));
1221 Return;
1222 }
1223 if (CAR(formula) == timesSymbol) {
1224 Prt_xtrVars("isLanguage, '*'");
1225 ExtractVars(CADR(formula));
1226 ExtractVars(CADDR(formula));
1227 Return;
1228 }
1229 if (CAR(formula) == inSymbol) {
1230 Prt_xtrVars("isLanguage, '%in%'");
1231 ExtractVars(CADR(formula));
1232 ExtractVars(CADDR(formula));
1233 Return;
1234 }
1235 if (CAR(formula) == slashSymbol) {
1236 Prt_xtrVars("isLanguage, '/'");
1237 ExtractVars(CADR(formula));
1238 ExtractVars(CADDR(formula));
1239 Return;
1240 }
1241 if (CAR(formula) == minusSymbol) { // '-'
1242 Prt_xtrVars("isLanguage, '-'");
1243 int len = length(formula);
1244 if (len == 2) { // unary
1245 ExtractVars(CADR(formula));
1246 }
1247 else {
1248 ExtractVars(CADR(formula));
1249 ExtractVars(CADDR(formula));
1250 }
1251 Return;
1252 }
1253 if (CAR(formula) == parenSymbol) {
1254 Prt_xtrVars("isLanguage, '('");
1255 ExtractVars(CADR(formula));
1256 Return;
1257 }
1258 // all other calls:
1259 Prt_xtrVars("_otherwise_");
1260 #ifdef DEBUG_terms
1261 Rprintf(" .. {%d} ExtractVars(f): installing formula f='%s'\n",
1262 n_xVars,
1263 CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1264 #endif
1265 InstallVar(formula);
1266 Return;
1267 }
1268 error(_("invalid model formula in ExtractVars"));
1269 }
1270
1271
1272 /* AllocTerm allocates an integer array for bit string representation
1273 * of a model term and initializes to __no bit set__ */
1274
AllocTerm(void)1275 static SEXP AllocTerm(void) // (<global> nwords)
1276 {
1277 SEXP term = allocVector(INTSXP, nwords); // caller must PROTECT
1278 memset(INTEGER(term), 0, nwords * sizeof(int));
1279 return term;
1280 }
1281
1282
1283 /* SetBit sets bit ``whichBit'' to value ``value''
1284 in the bit string representation of a term. */
1285
SetBit(SEXP term,int whichBit,int value)1286 static void SetBit(SEXP term, int whichBit, int value)
1287 {
1288 int
1289 word = (whichBit - 1) / WORDSIZE,
1290 offset = (- whichBit) % WORDSIZE;
1291 #ifdef DEBUG_terms
1292 Rprintf("SetBit(term, which=%3d, value=%d): |term|=%d, word= %d, offset= %2d\n",
1293 whichBit, value, length(term), word, offset);
1294 #endif
1295 if (value)
1296 ((unsigned *) INTEGER(term))[word] |= ((unsigned) 1 << offset);
1297 else
1298 ((unsigned *) INTEGER(term))[word] &= ~((unsigned) 1 << offset);
1299 }
1300
1301 /* 0. Install var (if needed)
1302 * 1a. Check if nwords is large enough for 'whichBit'
1303 * 1b. If not, increment nwords
1304 * 2. term = AllocTerm();
1305 * 3. SetBit(term, whichBit, 1);
1306 */
AllocTermSetBit1(SEXP var)1307 static SEXP AllocTermSetBit1(SEXP var) { // NB: caller must PROTECT
1308 int whichBit = InstallVar(var);
1309 if (nwords < (whichBit - 1)/WORDSIZE + 1)
1310 error("AllocT..Bit1(%s): Need to increment nwords to %d. Should not happen!\n",
1311 CHAR(STRING_ELT(deparse1line(var, 0), 0)),
1312 nwords+1);
1313 SEXP term = AllocTerm();
1314 SetBit(term, whichBit, 1);
1315 return term;
1316 }
1317
1318
1319 /* GetBit gets bit ``whichBit'' from the */
1320 /* bit string representation of a term. */
1321
GetBit(SEXP term,int whichBit)1322 static int GetBit(SEXP term, int whichBit)
1323 {
1324 int
1325 word = (whichBit - 1) / WORDSIZE,
1326 offset = (- whichBit) % WORDSIZE,
1327 word_off = INTEGER(term)[word] >> offset;
1328 #ifdef DEBUG_terms
1329 if(trace_GetBit) {
1330 Rprintf("GetBit(term,%3d): |term|=%2d, word=%d, offset= %2d --> bit= %d\n",
1331 whichBit, length(term), word, offset,
1332 word_off & 1);
1333 }
1334 #endif
1335 return word_off & 1;
1336 }
1337
1338
1339 /* OrBits computes a new (bit string) term */
1340 /* which contains the logical OR of the bits */
1341 /* in ``term1'' and ``term2''. */
1342
OrBits(SEXP term1,SEXP term2)1343 static SEXP OrBits(SEXP term1, SEXP term2)
1344 {
1345 SEXP term = AllocTerm();
1346 for (int i = 0; i < nwords; i++)
1347 INTEGER(term)[i] = INTEGER(term1)[i] | INTEGER(term2)[i];
1348 return term;
1349 }
1350
1351
1352 // BitCount counts the number of ``on'' bits in a term
BitCount(SEXP term,int nvar)1353 static int BitCount(SEXP term, int nvar)
1354 {
1355 int sum = 0;
1356 #ifdef DEBUG_terms
1357 Rprintf("BitCount(*, nvar=%d): ", nvar);
1358 #endif
1359 for (int i = 1; i <= nvar; i++)
1360 sum += GetBit(term, i);
1361 #ifdef DEBUG_terms
1362 Rprintf(" end{BitCount}\n");
1363 #endif
1364 return sum;
1365 }
1366
1367
1368 /* TermZero tests whether a (bit string) term is zero */
1369
TermZero(SEXP term)1370 static int TermZero(SEXP term)
1371 {
1372 for (int i = 0; i < nwords; i++)
1373 if (INTEGER(term)[i] != 0)
1374 return 0;
1375 return 1;
1376 }
1377
1378
1379 /* TermEqual tests two (bit string) terms for equality. */
1380
TermEqual(SEXP term1,SEXP term2)1381 static int TermEqual(SEXP term1, SEXP term2)
1382 {
1383 for (int i = 0; i < nwords; i++)
1384 if (INTEGER(term1)[i] != INTEGER(term2)[i])
1385 return 0;
1386 return 1;
1387 }
1388
1389
1390 /* StripTerm strips the specified term from the given list.
1391 This mutates the list (but the caller replaces it by 'root').
1392 Only called from DeleteTerms() i.e., " left - right "
1393 */
1394
StripTerm(SEXP term,SEXP list)1395 static SEXP StripTerm(SEXP term, SEXP list)
1396 {
1397 SEXP root = R_NilValue, prev = R_NilValue;
1398 if (TermZero(term))
1399 intercept = 0;
1400 while (list != R_NilValue) {
1401 if (TermEqual(term, CAR(list))) {
1402 if (prev != R_NilValue)
1403 SETCDR(prev, CDR(list));
1404 } else {
1405 if (root == R_NilValue)
1406 root = list;
1407 prev = list;
1408 }
1409 list = CDR(list);
1410 }
1411 return root;
1412 }
1413
1414
1415 /* TrimRepeats removes duplicates of (bit string) terms in a model formula.
1416 Also drops zero terms. */
1417
TrimRepeats(SEXP list)1418 static SEXP TrimRepeats(SEXP list)
1419 {
1420 // Drop zero terms at the start of the list.
1421 while (list != R_NilValue && TermZero(CAR(list))) {
1422 list = CDR(list);
1423 }
1424 if (list == R_NilValue || CDR(list) == R_NilValue)
1425 return list;
1426
1427 PROTECT(list);
1428 // Find out which terms are duplicates.
1429 SEXP all_terms = PROTECT(PairToVectorList(list)),
1430 duplicate_sexp = PROTECT(duplicated(all_terms, FALSE));
1431 int i_p1 = 1, *is_duplicate = LOGICAL(duplicate_sexp);
1432
1433 // Remove the zero terms and duplicates from the list.
1434 for (SEXP current = list; CDR(current) != R_NilValue; i_p1++) {
1435 SEXP next = CDR(current);
1436 if (is_duplicate[i_p1] || TermZero(CAR(next))) {
1437 // Remove the node from the list.
1438 SETCDR(current, CDR(next));
1439 } else {
1440 current = next;
1441 }
1442 }
1443
1444 UNPROTECT(3);
1445 return list;
1446 }
1447
1448
1449
1450 /*==========================================================================*/
1451
1452 /* Model Formula Manipulation */
1453 /* These functions take a numerically coded */
1454 /* formula and fully expand it. */
1455
1456 static SEXP EncodeVars(SEXP);/* defined below */
1457
1458
1459 /* PlusTerms expands ``left'' and ``right'' and */
1460 /* concatenates their terms (removing duplicates). */
1461
PlusTerms(SEXP left,SEXP right)1462 static SEXP PlusTerms(SEXP left, SEXP right)
1463 {
1464 PROTECT(left = EncodeVars(left));
1465 right = EncodeVars(right);
1466 UNPROTECT(1);
1467 return TrimRepeats(listAppend(left, right));
1468 }
1469
1470
1471 /* InteractTerms expands ``left'' and ``right'' */
1472 /* and forms a new list of terms containing the bitwise */
1473 /* OR of each term in ``left'' with each term in ``right''. */
1474
InteractTerms(SEXP left,SEXP right)1475 static SEXP InteractTerms(SEXP left, SEXP right)
1476 {
1477 SEXP term, l, r, t;
1478 PROTECT(left = EncodeVars(left));
1479 PROTECT(right = EncodeVars(right));
1480 PROTECT(term = allocList(length(left) * length(right)));
1481 t = term;
1482 for (l = left; l != R_NilValue; l = CDR(l))
1483 for (r = right; r != R_NilValue; r = CDR(r)) {
1484 SETCAR(t, OrBits(CAR(l), CAR(r)));
1485 t = CDR(t);
1486 }
1487 UNPROTECT(3);
1488 return TrimRepeats(term);
1489 }
1490
1491
1492 /* CrossTerms expands ``left'' and ``right'' */
1493 /* and forms the ``cross'' of the list of terms. */
1494 /* Duplicates are removed. */
1495
CrossTerms(SEXP left,SEXP right)1496 static SEXP CrossTerms(SEXP left, SEXP right)
1497 {
1498 SEXP term, l, r, t;
1499 PROTECT(left = EncodeVars(left));
1500 PROTECT(right = EncodeVars(right));
1501 PROTECT(term = allocList(length(left) * length(right)));
1502 t = term;
1503 for (l = left; l != R_NilValue; l = CDR(l))
1504 for (r = right; r != R_NilValue; r = CDR(r)) {
1505 SETCAR(t, OrBits(CAR(l), CAR(r)));
1506 t = CDR(t);
1507 }
1508 UNPROTECT(3);
1509 listAppend(right, term);
1510 listAppend(left, right);
1511 return TrimRepeats(left);
1512 }
1513
1514
1515 /* PowerTerms expands the ``left'' form and then */
1516 /* raises it to the power specified by the right term. */
1517 /* Allocation here is wasteful, but so what ... */
1518
PowerTerms(SEXP left,SEXP right)1519 static SEXP PowerTerms(SEXP left, SEXP right)
1520 {
1521 SEXP term, l, r, t;
1522 int i, ip;
1523 ip = asInteger(right);
1524 if (ip==NA_INTEGER || ip <= 1)
1525 error(_("invalid power in formula"));
1526 term = R_NilValue; /* -Wall */
1527 PROTECT(left = EncodeVars(left));
1528 right = left;
1529 for (i=1; i < ip; i++) {
1530 PROTECT(right);
1531 PROTECT(term = allocList(length(left) * length(right)));
1532 t = term;
1533 for (l = left; l != R_NilValue; l = CDR(l))
1534 for (r = right; r != R_NilValue; r = CDR(r)) {
1535 SETCAR(t, OrBits(CAR(l), CAR(r)));
1536 t = CDR(t);
1537 }
1538 UNPROTECT(2);
1539 right = TrimRepeats(term);
1540 }
1541 UNPROTECT(1);
1542 return term;
1543 }
1544
1545
1546 /* InTerms expands ``left'' and ``right'' and */
1547 /* forms the ``nest'' of the the left in the */
1548 /* interaction of the right */
1549
InTerms(SEXP left,SEXP right)1550 static SEXP InTerms(SEXP left, SEXP right)
1551 {
1552 PROTECT(left = EncodeVars(left));
1553 PROTECT(right = EncodeVars(right));
1554 SEXP t, term = PROTECT(AllocTerm());
1555 int *term_ = INTEGER(term);
1556 /* Bitwise or of all terms on right */
1557 for (t = right; t != R_NilValue; t = CDR(t)) {
1558 for (int i = 0; i < nwords; i++)
1559 term_[i] = term_[i] | INTEGER(CAR(t))[i];
1560 }
1561 /* Now bitwise or with each term on the left */
1562 for (t = left; t != R_NilValue; t = CDR(t))
1563 for (int i = 0; i < nwords; i++)
1564 INTEGER(CAR(t))[i] = term_[i] | INTEGER(CAR(t))[i];
1565 UNPROTECT(3);
1566 return TrimRepeats(left);
1567 }
1568
1569 /* NestTerms expands ``left'' and ``right'' */
1570 /* and forms the ``nest'' of the list of terms. */
1571 /* Duplicates are removed. */
1572
NestTerms(SEXP left,SEXP right)1573 static SEXP NestTerms(SEXP left, SEXP right)
1574 {
1575 PROTECT(left = EncodeVars(left));
1576 PROTECT(right = EncodeVars(right));
1577 SEXP t, term = PROTECT(AllocTerm());
1578 int *term_ = INTEGER(term);
1579 /* Bitwise or of all terms on left */
1580 for (t = left; t != R_NilValue; t = CDR(t)) {
1581 for (int i = 0; i < nwords; i++)
1582 term_[i] = term_[i] | INTEGER(CAR(t))[i];
1583 }
1584 /* Now bitwise or with each term on the right */
1585 for (t = right; t != R_NilValue; t = CDR(t))
1586 for (int i = 0; i < nwords; i++)
1587 INTEGER(CAR(t))[i] = term_[i] | INTEGER(CAR(t))[i];
1588 UNPROTECT(3);
1589 listAppend(left, right);
1590 return TrimRepeats(left);
1591 }
1592
1593
1594 /* DeleteTerms expands ``left'' and ``right'' */
1595 /* and then removes any terms which appear in */
1596 /* ``right'' from ``left''. */
1597
DeleteTerms(SEXP left,SEXP right)1598 static SEXP DeleteTerms(SEXP left, SEXP right)
1599 {
1600 PROTECT(left = EncodeVars(left)); parity = 1-parity;
1601 PROTECT(right = EncodeVars(right)); parity = 1-parity;
1602 for (SEXP t = right; t != R_NilValue; t = CDR(t))
1603 left = StripTerm(CAR(t), left);
1604 UNPROTECT(2);
1605 return left;
1606 }
1607
1608
1609 /* EncodeVars() performs model expansion and bit string encoding.
1610 * This is the real workhorse of model expansion in terms.formula() == termsform()
1611 *
1612 * Returns a (pair)list of length 'nterm' containing integer vectors [1:nwords],
1613 */
EncodeVars(SEXP formula)1614 static SEXP EncodeVars(SEXP formula)
1615 {
1616 if (isNull(formula)) return R_NilValue;
1617 else if (isOne(formula)) {
1618 intercept = (parity) ? 1 : 0; return R_NilValue;
1619 }
1620 else if (isZero(formula)) {
1621 intercept = (parity) ? 0 : 1; return R_NilValue;
1622 }
1623 // else :
1624 SEXP term;
1625 if (isSymbol(formula)) {
1626 if (formula == dotSymbol && framenames != R_NilValue) {
1627 /* prior to 1.7.0 this made term.labels in reverse order. */
1628 SEXP r = R_NilValue, v = R_NilValue; /* -Wall */
1629 if (!LENGTH(framenames)) return r;
1630 const void *vmax = vmaxget();
1631 for (int i = 0; i < LENGTH(framenames); i++) {
1632 /* change in 1.6.0 do not use duplicated names */
1633 const char *c = translateChar(STRING_ELT(framenames, i));
1634 for(int j = 0; j < i; j++)
1635 if(!strcmp(c, translateChar(STRING_ELT(framenames, j))))
1636 error(_("duplicated name '%s' in data frame using '.'"),
1637 c);
1638 term = AllocTermSetBit1(install(c));
1639 #ifdef DEBUG_terms
1640 Rprintf(".. in 'isSymbol(<dotSymbol>), after AllocT...1()\n");
1641 #endif
1642 if(i == 0) PROTECT(v = r = cons(term, R_NilValue));
1643 else {SETCDR(v, CONS(term, R_NilValue)); v = CDR(v);}
1644 }
1645 UNPROTECT(1);
1646 vmaxset(vmax);
1647 return r;
1648 }
1649 else {
1650 term = AllocTermSetBit1(formula);
1651 #ifdef DEBUG_terms
1652 Rprintf(".. in 'isSymbol(%s) [regular], after AllocT...1()\n",
1653 CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1654 #endif
1655 return CONS(term, R_NilValue);
1656 }
1657 }
1658 if (isLanguage(formula)) {
1659 if (CAR(formula) == tildeSymbol) {
1660 if (isNull(CDDR(formula)))
1661 return EncodeVars(CADR(formula));
1662 else
1663 return EncodeVars(CADDR(formula));
1664 }
1665 if (CAR(formula) == plusSymbol) {
1666 int len = length(formula);
1667 if (len == 2) // unary '+'
1668 return EncodeVars(CADR(formula));
1669 else
1670 return PlusTerms(CADR(formula), CADDR(formula));
1671 }
1672 if (CAR(formula) == colonSymbol) {
1673 return InteractTerms(CADR(formula), CADDR(formula));
1674 }
1675 if (CAR(formula) == timesSymbol) {
1676 return CrossTerms(CADR(formula), CADDR(formula));
1677 }
1678 if (CAR(formula) == inSymbol) {
1679 return InTerms(CADR(formula), CADDR(formula));
1680 }
1681 if (CAR(formula) == slashSymbol) {
1682 return NestTerms(CADR(formula), CADDR(formula));
1683 }
1684 if (CAR(formula) == powerSymbol) {
1685 return PowerTerms(CADR(formula), CADDR(formula));
1686 }
1687 if (CAR(formula) == minusSymbol) {
1688 int len = length(formula);
1689 if (len == 2) // unary '-'
1690 return DeleteTerms(R_NilValue, CADR(formula));
1691 return DeleteTerms(CADR(formula), CADDR(formula));
1692 }
1693 if (CAR(formula) == parenSymbol) {
1694 return EncodeVars(CADR(formula));
1695 }
1696 term = AllocTermSetBit1(formula);
1697 #ifdef DEBUG_terms
1698 Rprintf(".. before returning from EncodeVars(%s), after AllocT...1()\n",
1699 CHAR(STRING_ELT(deparse1line(formula, 0), 0)));
1700 #endif
1701 return CONS(term, R_NilValue);
1702 }
1703 error(_("invalid model formula in EncodeVars"));
1704 return R_NilValue;/*NOTREACHED*/
1705 }
1706
1707
1708 /* TermCode decides on the encoding of a model term. */
1709 /* Returns 1 if variable ``whichBit'' in ``thisTerm'' */
1710 /* is to be encoded by contrasts and 2 if it is to be */
1711 /* encoded by dummy variables. This is decided using */
1712 /* the heuristic described in Statistical Models in S, page 38. */
1713
1714 // called only in one place, in "step 4"
TermCode(SEXP termlist,SEXP thisterm,int whichbit,SEXP term)1715 static int TermCode(SEXP termlist, SEXP thisterm, int whichbit, SEXP term)
1716 {
1717 int *term_ = INTEGER(term),
1718 *th_t = INTEGER(CAR(thisterm));
1719 for (int i = 0; i < nwords; i++)
1720 term_[i] = th_t[i];
1721
1722 /* Eliminate factor ``whichbit'' */
1723 SetBit(term, whichbit, 0);
1724
1725 /* Search preceding terms for a match */
1726 /* Zero is a possibility - it is a special case */
1727
1728 int allzero = 1;
1729 for (int i = 0; i < nwords; i++) {
1730 if (term_[i]) {
1731 allzero = 0;
1732 break;
1733 }
1734 }
1735 if (allzero)
1736 return 1;
1737
1738 for (SEXP t = termlist; t != thisterm; t = CDR(t)) {
1739 allzero = 1;
1740 int *ct = INTEGER(CAR(t));
1741 for (int i = 0; i < nwords; i++)
1742 if (term_[i] & ~ct[i]) {
1743 allzero = 0; break;
1744 }
1745 if (allzero)
1746 return 1;
1747 }
1748 return 2;
1749 }
1750
1751
1752 /* Internal code for the ``terms'' function */
1753 /* The value is a formula with an assortment */
1754 /* of useful attributes. */
1755
1756 // R's terms.formula(x, specials, data, keep.order, allowDotAsName) in ../R/models.R
termsform(SEXP args)1757 SEXP termsform(SEXP args)
1758 {
1759 args = CDR(args); // (called via .External)
1760
1761 /* Always fetch these values rather than trying to remember them
1762 between calls. The overhead is minimal. */
1763
1764 tildeSymbol = install("~");
1765 plusSymbol = install("+");
1766 minusSymbol = install("-");
1767 timesSymbol = install("*");
1768 slashSymbol = install("/");
1769 colonSymbol = install(":");
1770 powerSymbol = install("^");
1771 dotSymbol = install(".");
1772 parenSymbol = install("(");
1773 inSymbol = install("%in%");
1774
1775 /* Do we have a model formula? <==> Check for unary or binary ~ */
1776
1777 if (!isLanguage(CAR(args)) ||
1778 CAR(CAR(args)) != tildeSymbol ||
1779 (length(CAR(args)) != 2 && length(CAR(args)) != 3))
1780 error(_("argument is not a valid model"));
1781
1782 haveDot = FALSE;
1783
1784 SEXP ans = PROTECT(duplicate(CAR(args)));
1785
1786 /* The formula will be returned, modified if haveDot becomes TRUE */
1787
1788 SEXP specials = CADR(args);
1789 if(length(specials) && !isString(specials))
1790 error(_("'specials' must be NULL or a character vector"));
1791 SEXP
1792 a = CDDR(args),
1793 /* We use data to get the value to substitute for "." in formulae */
1794 data = CAR(a);
1795 a = CDR(a);
1796 if (isNull(data) || isEnvironment(data))
1797 framenames = R_NilValue;
1798 else if (isFrame(data))
1799 framenames = getAttrib(data, R_NamesSymbol);
1800 else
1801 error(_("'data' argument is of the wrong type"));
1802 PROTECT_WITH_INDEX(framenames, &vpi);
1803
1804 Rboolean hadFrameNames = FALSE;
1805 if (framenames != R_NilValue) {
1806 if(length(framenames)) hadFrameNames = TRUE;
1807 if (length(CAR(args)) == 3)
1808 CheckRHS(CADR(CAR(args)));
1809 }
1810
1811 /* Preserve term order? */
1812 int keepOrder = asLogical(CAR(a));
1813 if (keepOrder == NA_LOGICAL)
1814 keepOrder = 0;
1815
1816 a = CDR(a);
1817 int allowDot = asLogical(CAR(a));
1818 if (allowDot == NA_LOGICAL) allowDot = 0;
1819
1820 // a := attributes(<answer>)
1821 a = allocList((specials == R_NilValue) ? 8 : 9);
1822 SET_ATTRIB(ans, a);
1823
1824 /* Step 1: Determine the ``variables'' in the model :
1825 * ------ Here we create a call of the form list(...).
1826 * You can evaluate it to get the model variables or use substitute
1827 * and then pull the result apart to get the variable names. */
1828
1829 intercept = 1;
1830 parity = 1;
1831 response = 0;
1832 PROTECT(varlist = LCONS(install("list"), R_NilValue));
1833 #ifdef DEBUG_terms
1834 n_xVars = 0; // the nesting level of ExtractVars()
1835 Rprintf("termsform(): calling ExtractVars(CAR(args), 1), where\n CAR(args)='%s'\n",
1836 CHAR(STRING_ELT(deparse1line(CAR(args), 0), 0)));
1837 #endif
1838 ExtractVars(CAR(args));
1839 //^^^^^^^^^ fills the 'varlist' = attr(<terms>, "variables")
1840 UNPROTECT(1);
1841 SETCAR(a, varlist);
1842 SET_TAG(a, install("variables"));
1843 a = CDR(a);
1844
1845 int nvar = length(varlist) - 1; /* Number of variables in the formula */
1846
1847 /* in allocating words need to allow for intercept term (PR#15735) */
1848 nwords = nvar/WORDSIZE + 1; // global; used & incremented in EncodeVars()
1849 #ifdef DEBUG_terms
1850 Rprintf(" .. after ExtractVars(): ini. nvar = %d, nwords = %d;%sCalling EncodeVars():%s",
1851 nvar, nwords,
1852 "\n------------------\n",
1853 "\n------------------\n");
1854 #endif
1855
1856 /* Step 2: Recode the model terms in binary form
1857 * ------ and at the same time, expand the model formula. */
1858
1859 /* FIXME: this includes 'specials' in the model */
1860 /* There perhaps needs to be a an extra pass */
1861 /* through the model to delete any terms which */
1862 /* contain specials. Actually, specials should */
1863 /* only enter additively so this should also be */
1864 /* checked and abort forced if not. */
1865
1866 /* BDR 2002-01-29: S does include specials, so code may rely on this */
1867
1868 /* FIXME: this is also the point where nesting needs to be taken care of. */
1869
1870 SEXP formula = PROTECT(EncodeVars(CAR(args)));
1871 // ^^^^^^^^^^
1872 if(length(varlist) != nvar + 1) {
1873 warning(_("'varlist' has changed (from nvar=%d) to new %d after EncodeVars() -- should no longer happen!"),
1874 nvar, length(varlist) - 1);
1875 nvar = length(varlist) - 1;
1876 }
1877 #ifdef DEBUG_terms
1878 Rprintf("after EncodeVars(): final (nvar,nwords) = (%d,%d); formula 'L' =\n", nvar, nwords);
1879 // Show the content of formula [[ Do we have integer(nwords) or just int.(1) ? ]]
1880 printPList(formula);
1881 #endif
1882
1883 /* Step 2a: Compute variable names */
1884
1885 SEXP v, call, varnames = PROTECT(allocVector(STRSXP, nvar));
1886 {
1887 R_xlen_t i;
1888 for (v = CDR(varlist), i = 0; v != R_NilValue; v = CDR(v))
1889 SET_STRING_ELT(varnames, i++,
1890 STRING_ELT(deparse1line(CAR(v), 0), 0));
1891 }
1892 #ifdef DEBUG_terms
1893 Rprintf("after Step 2a: variable names:\n"); printVector(varnames, 1, 1);
1894 #endif
1895
1896 /* Step 2b: Find and remove any offset(s) */
1897
1898 /* first see if any of the variables are offsets */
1899 R_xlen_t k = 0;
1900 for (R_xlen_t l = response; l < nvar; l++)
1901 if (!strncmp(CHAR(STRING_ELT(varnames, l)), "offset(", 7)) k++;
1902 if (k > 0) {
1903 #ifdef DEBUG_terms
1904 Rprintf(" step 2b: found k=%ld offset(.)s\n", k);
1905 #endif
1906 Rboolean foundOne = FALSE; /* has there been a non-offset term? */
1907 /* allocate the "offsets" attribute */
1908 SETCAR(a, v = allocVector(INTSXP, k));
1909 for (int l = response, k = 0; l < nvar; l++)
1910 if (!strncmp(CHAR(STRING_ELT(varnames, l)), "offset(", 7))
1911 INTEGER(v)[k++] = l + 1;
1912 SET_TAG(a, install("offset"));
1913 a = CDR(a);
1914 /* now remove offset terms from the formula */
1915 call = formula; /* call is to be the previous term once one is found */
1916 while (1) {
1917 SEXP thisterm = foundOne ? CDR(call) : call;
1918 Rboolean have_offset = FALSE;
1919 #ifdef DEBUG_terms
1920 Rprintf(" while (1) : foundOne = %d; length(thisterm) =%d; ",
1921 foundOne, length(thisterm));
1922 #endif
1923 if(length(thisterm) == 0) break;
1924 for (int i = 1; i <= nvar; i++)
1925 if (GetBit(CAR(thisterm), i) &&
1926 !strncmp(CHAR(STRING_ELT(varnames, i-1)), "offset(", 7)) {
1927 have_offset = TRUE;
1928 #ifdef DEBUG_terms
1929 Rprintf(" i=%d: have_offset, ", i);
1930 #endif
1931 break;
1932 }
1933 if (have_offset) {
1934 if (!foundOne) call = formula = CDR(formula);
1935 else SETCDR(call, CDR(thisterm));
1936 } else {
1937 if (foundOne) call = CDR(call);
1938 else foundOne = TRUE;
1939 }
1940 }
1941 }
1942 int nterm = length(formula); /* = #{model terms} */
1943 #ifdef DEBUG_terms
1944 Rprintf("after step 2: k=%ld, nterm:= |formula| = %d\n", k, nterm);
1945 // fails: 'formula' is "pairlist", not "vector" !!
1946 // Rprintf("formula[1..nterm=%d] = ", nterm); printVector(formula, 1, 0);
1947
1948 /* Step 3: Reorder the model terms by BitCount, otherwise
1949 * ------ preserving their order. */
1950
1951 Rprintf(" .. step 3 .. reorder model terms by BitCount():\n");
1952 #endif
1953 SEXP ord = PROTECT(allocVector(INTSXP, nterm)),
1954 pattern = PROTECT(allocVector(VECSXP, nterm));
1955 {
1956 SEXP sCounts = PROTECT(allocVector(INTSXP, nterm));
1957 int bitmax = 0,
1958 *iord = INTEGER(ord),
1959 *counts = INTEGER(sCounts);
1960 R_xlen_t n;
1961 for (call = formula, n = 0; call != R_NilValue; call = CDR(call), n++) {
1962 SET_VECTOR_ELT(pattern, n, CAR(call));
1963 counts[n] = BitCount(CAR(call), nvar);
1964 #ifdef DEBUG_terms
1965 int lc = length(CAR(call));
1966 Rprintf(" -> BitCount(CAR(call),nv) =:counts[n=%ld]=%2d, from bitpat.in int[%s%d]): ",
1967 n+1, counts[n], (lc==1)?"":"1:", lc);
1968 printVector(CAR(call), 0, 0);
1969 #endif
1970 }
1971 for (n = 0; n < nterm; n++)
1972 if(counts[n] > bitmax) bitmax = counts[n];
1973 #ifdef DEBUG_terms
1974 Rprintf("step 3 (part I): counts[1..nterm]: "); printVector(sCounts, 1, 0);
1975 Rprintf(" bitmax = max(counts[]) = %d\n", bitmax);
1976 #endif
1977
1978 if(keepOrder) {
1979 for (n = 0; n < nterm; n++)
1980 iord[n] = counts[n];
1981 } else {
1982 call = formula;
1983 int m = 0;
1984 for (int i = 0; i <= bitmax; i++) /* can order 0 occur? */
1985 for (n = 0; n < nterm; n++)
1986 if (counts[n] == i) {
1987 SETCAR(call, VECTOR_ELT(pattern, n));
1988 call = CDR(call);
1989 iord[m++] = i;
1990 }
1991 }
1992 UNPROTECT(2);
1993 }
1994 #ifdef DEBUG_terms
1995 Rprintf("after step 3: ord[1:nterm]: "); printVector(ord, 1, 0);
1996 Rprintf("--=--\n .. step 4: Create \"factors\" pattern matrix:\n");
1997 #endif
1998
1999 /* Step 4: Compute the factor pattern for the model. */
2000 /* 0 - the variable does not appear in this term. */
2001 /* 1 - code the variable by contrasts in this term. */
2002 /* 2 - code the variable by indicators in this term. */
2003
2004 if (nterm > 0) {
2005 SETCAR(a, pattern = allocMatrix(INTSXP, nvar, nterm));
2006 int *pattn = INTEGER(pattern);
2007 for (R_xlen_t i = 0; i < ((R_xlen_t) nterm) * nvar; i++)
2008 pattn[i] = 0;
2009 SEXP term = PROTECT(AllocTerm());
2010 R_xlen_t n_n = -1; // n = 0; ==> n_n = -1 + n*nvar = -1
2011 for (call = formula; call != R_NilValue; call = CDR(call)) {
2012 #ifdef DEBUG_terms
2013 Rprintf(" st.4: (bitpattern in int) term: "); printVector(CAR(call), 0,0);
2014 #endif
2015 for (int i = 1; i <= nvar; i++) {
2016 if (GetBit(CAR(call), i)) {
2017 #ifdef DEBUG_terms
2018 Rprintf(" var i=%d --> TermCode(., c., i, t.) {bit[i] := 0}\n", i);
2019 #endif
2020 pattn[i+n_n] = TermCode(formula, call, i, term);
2021 #ifdef DEBUG_terms
2022 Rprintf(" => pattn[%d,%d] = %d\n", (n_n+1)/nvar, i, pattn[i+n_n]);
2023 #endif
2024 }
2025 }
2026 n_n += nvar; // n++ ==> n_n = -1 + n*nvar
2027 }
2028 UNPROTECT(1);
2029 }
2030 else {
2031 SETCAR(a, pattern = allocVector(INTSXP,0));
2032 }
2033 SET_TAG(a, install("factors"));
2034 a = CDR(a);
2035
2036 #ifdef DEBUG_terms
2037 Rprintf(".. after step 4: filled \"factors\" matrix (nvar x nterm) = (%d x %d)\n",
2038 nvar, nterm);
2039 Rprintf("--=--\n .. step 5 .. computing term labels \"term.labels\":\n");
2040 #endif
2041
2042 /* Step 5: Compute term labels */
2043
2044 SEXP termlabs = PROTECT(allocVector(STRSXP, nterm));
2045 R_xlen_t n = 0;
2046 for (call = formula; call != R_NilValue; call = CDR(call)) {
2047 R_xlen_t l = 0;
2048 #ifdef DEBUG_terms
2049 Rprintf(" st.5: (bitpattern in int) term: "); printVector(CAR(call), 0, 0);
2050 Rprintf(" ---- {not tracing GetBit() when determing 'cbuf' length}\n");
2051 trace_GetBit = FALSE; // *not* tracing below
2052 #endif
2053 for (int i = 1; i <= nvar; i++) {
2054 if (GetBit(CAR(call), i)) {
2055 if (l > 0)
2056 l += 1;
2057 l += (int) strlen(CHAR(STRING_ELT(varnames, i - 1)));
2058 }
2059 }
2060 #ifdef DEBUG_terms
2061 trace_GetBit = TRUE; // back to tracing
2062 Rprintf(" --> cbuf length %d (+1 for final \\0)\n", l);
2063 #endif
2064 char cbuf[l+1];
2065 cbuf[0] = '\0';
2066 l = 0;
2067 for (int i = 1; i <= nvar; i++) {
2068 if (GetBit(CAR(call), i)) {
2069 if (l > 0)
2070 strcat(cbuf, ":");
2071 strcat(cbuf, CHAR(STRING_ELT(varnames, i - 1)));
2072 l++;
2073 }
2074 }
2075 SET_STRING_ELT(termlabs, n, mkChar(cbuf));
2076 n++;
2077 #ifdef DEBUG_terms
2078 Rprintf(" -> term.labels[%ld]: '%s'\n", n, cbuf);
2079 #endif
2080 }
2081
2082 #ifdef DEBUG_terms
2083 Rprintf(".. finished step 5: term.labels: "); printVector(termlabs, 1, /* quote */ 1);
2084 #endif
2085
2086 if (nterm > 0) { // dimnames("factors") <- ...
2087 PROTECT(v = allocVector(VECSXP, 2));
2088 SET_VECTOR_ELT(v, 0, varnames);
2089 SET_VECTOR_ELT(v, 1, termlabs);
2090 setAttrib(pattern, R_DimNamesSymbol, v);
2091 UNPROTECT(1);
2092 }
2093 SETCAR(a, termlabs);
2094 UNPROTECT(1); // termlabs
2095 SET_TAG(a, install("term.labels"));
2096 a = CDR(a);
2097
2098 /* If there are specials stick them in here */
2099
2100 if (specials != R_NilValue) {
2101 R_xlen_t j;
2102 const void *vmax = vmaxget();
2103 int i = length(specials);
2104 SEXP t;
2105 PROTECT(v = allocList(i));
2106 for (j = 0, t = v; j < i; j++, t = CDR(t)) {
2107 const char *ss = translateChar(STRING_ELT(specials, j));
2108 SET_TAG(t, install(ss));
2109 R_xlen_t n = (int) strlen(ss);
2110 SETCAR(t, allocVector(INTSXP, 0));
2111 R_xlen_t k = 0;
2112 for (R_xlen_t l = 0; l < nvar; l++) {
2113 if (!strncmp(CHAR(STRING_ELT(varnames, l)), ss, n))
2114 if (CHAR(STRING_ELT(varnames, l))[n] == '(')
2115 k++;
2116 }
2117 if (k > 0) {
2118 SETCAR(t, allocVector(INTSXP, k));
2119 k = 0;
2120 for (int l = 0; l < nvar; l++) {
2121 if (!strncmp(CHAR(STRING_ELT(varnames, l)), ss, n))
2122 if (CHAR(STRING_ELT(varnames, l))[n] == '('){
2123 INTEGER(CAR(t))[k++] = l+1;
2124 }
2125 }
2126 }
2127 else SETCAR(t, R_NilValue);
2128 }
2129 SETCAR(a, v);
2130 SET_TAG(a, install("specials"));
2131 a = CDR(a);
2132 UNPROTECT(1);
2133 vmaxset(vmax);
2134 }
2135
2136
2137 /* Step 6: Fix up the formula by substituting for dot, which should be
2138 the framenames joined by + */
2139
2140 if (haveDot) {
2141 if(length(framenames)) {
2142 SEXP rhs;
2143 PROTECT_INDEX ind;
2144 PROTECT_WITH_INDEX(rhs = installTrChar(STRING_ELT(framenames, 0)),
2145 &ind);
2146 for (R_xlen_t i = 1; i < LENGTH(framenames); i++) {
2147 REPROTECT(rhs = lang3(plusSymbol, rhs,
2148 installTrChar(STRING_ELT(framenames, i))),
2149 ind);
2150 }
2151 if (!isNull(CADDR(ans)))
2152 SETCADDR(ans, ExpandDots(CADDR(ans), rhs));
2153 else
2154 SETCADR(ans, ExpandDots(CADR(ans), rhs));
2155 UNPROTECT(1);
2156 } else if(!allowDot && !hadFrameNames) {
2157 error(_("'.' in formula and no 'data' argument"));
2158 }
2159 }
2160
2161 SETCAR(a, allocVector(INTSXP, nterm));
2162 {
2163 R_xlen_t n = 0;
2164 int *ia = INTEGER(CAR(a)), *iord = INTEGER(ord);
2165 for (call = formula; call != R_NilValue; call = CDR(call), n++)
2166 ia[n] = iord[n];
2167 }
2168
2169 SET_TAG(a, install("order"));
2170 a = CDR(a);
2171
2172 SETCAR(a, ScalarInteger(intercept != 0));
2173 SET_TAG(a, install("intercept"));
2174 a = CDR(a);
2175
2176 SETCAR(a, ScalarInteger(response != 0));
2177 SET_TAG(a, install("response"));
2178 a = CDR(a);
2179
2180 SETCAR(a, mkString("terms"));
2181 SET_TAG(a, R_ClassSymbol);
2182 SET_OBJECT(ans, 1);
2183
2184 SETCDR(a, R_NilValue); /* truncate if necessary */
2185
2186 UNPROTECT(5);
2187 return ans;
2188 }
2189 // termsform()
2190