1## $Id: sas.get.s 827 2012-10-21 13:13:34Z harrelfe $
2sas.get <-
3  function(libraryName,
4           member,
5           variables = character(0),
6           ifs = character(0),
7           format.library = libraryName,
8           id,
9           dates. = c("sas","yymmdd","yearfrac","yearfrac2"),
10           keep.log = TRUE,
11           log.file = "_temp_.log",
12           macro = sas.get.macro,
13           data.frame.out = existsFunction("data.frame"),
14           clean.up = FALSE,
15           quiet = FALSE,
16           temp = tempfile("SaS"),
17           formats=TRUE,
18           recode=formats,
19           special.miss=FALSE,
20           sasprog="sas",
21           as.is=.5,
22           check.unique.id=TRUE,
23           force.single=FALSE,
24           where,
25           uncompress=FALSE)
26{
27  if(force.single) stop('force.single does not work under R')
28  dates. <- match.arg(dates.)
29
30  fexists <- function(name) {
31    w <- file.exists(name)
32    attr(w, 'which') <- name[w]
33    w
34  }
35
36  file.is.dir <- function(name) {
37    isdir <- file.info(name)$isdir
38    isdir && !is.na(isdir)
39  }
40
41  file.is.readable <- function(name) file.access(name,4)==0
42
43  fileShow <- function(x) file.show(x)
44
45  if(recode) formats <- TRUE
46
47  if(missing(formats) || formats) {
48    ## *****  Next line begins mod from Mike Kattan edits 11 Sep 97
49    ## Redone FEH 22Oct00
50    no.format <- all(!fexists(file.path(format.library,
51                                        c('formats.sc2','formats.sct','formats.sct01','formats.sas7bcat'))))
52    if(no.format) {
53      if((!missing(formats) && formats) || (!missing(recode) && recode))
54        warning(paste(paste(format.library,
55                            "/formats.sc? or formats.sas7bcat",sep = ""),
56                      " not found. Formatting ignored. \n"))
57      formats <- recode <- FALSE
58    }
59    ## ***** End Mike Kattan edits 11 Sep 97
60  }
61
62  ## 5 Changes here from Claudie Berger <claudie@osteo1.ri.mgh.mcgill.ca> 19feb00
63  ## Allows work on sas v7.
64  sasin   <- paste(temp, ".3.sas", sep = "")
65  sasout1 <- paste(temp, ".1.sas", sep = "")
66  sasout2 <- paste(temp, ".2.sas", sep = "")
67  sasout3 <- paste(temp, ".4.sas", sep = "")
68  sasout4 <- paste(temp, ".5.sas", sep = "")
69  nvariables <- length(variables)
70  if(nvariables>0) {
71    if(any(jdup <- duplicated(variables)))
72      stop(paste("duplicate variables requested: ", variables[jdup]))
73  }
74
75  varstring <- paste(variables, collapse = "\n ")
76  ifs <- paste("'",paste(ifs, collapse = ";\n "),"'",sep="")
77  if(length(sasin) != 1)
78    stop("Illegal temporary file name")
79
80  temp.files <- c(sasin, sasout1, sasout2, sasout3, sasout4)
81  if(!keep.log)
82    temp.files <- c(temp.files, log.file)
83
84  if(clean.up)
85    on.exit(unlink(temp.files))
86  ##on.exit(sys(paste("rm -f", paste(temp.files, collapse = " "))))
87  ##  4oct03
88
89  if(missing(member))
90    stop("SAS member name is required")
91
92  if(missing(libraryName))
93    stop("SAS library name is required")
94
95  cat(macro, sep="\n", file=sasin)
96
97  sasds.suffix <- c('sd2','sd7','ssd01','ssd02','ssd03','ssd04','sas7bdat')
98  ## 22Oct00
99
100  if(libraryName == "") {
101    if(uncompress) {  # 22Oct00
102      unix.file <- paste(member, sasds.suffix, sep=".")
103      if(any(fe <- fexists(paste(unix.file,".gz",sep=""))))
104        system(paste("gunzip ",attr(fe,'which'),'.gz',sep=''))
105      else if(any(fe <- fexists(paste(unix.file,".Z",sep=""))))
106        system(paste("uncompress ",attr(fe,'which'),'.Z',sep=''))
107    }
108
109    cat("%sas_get(", member, ",\n",
110        "  ", sasout1, ",\n",
111        "  ", sasout2, ",\n",
112        "  ", sasout3, ",\n",
113        "  ", sasout4, ",\n",
114        "  dates=", dates., ",\n",
115        "  vars=",  varstring, ",\n",
116        "  ifs=",   ifs, ",\n",
117        "  formats=", as.integer(formats), "\n,",
118        "  specmiss=", as.integer(special.miss), ");\n",
119        file = sasin, append = TRUE, sep = "")
120  } else {
121    if(!file.is.dir(libraryName))
122      stop(paste(sep = "", "library, \"", libraryName,
123                 "\", is not a directory"))
124
125    unix.file <- file.path(libraryName, paste(member, sasds.suffix, sep="."))
126
127    ##23Nov00
128    if(uncompress) {  #22Oct00
129      if(any(fe <- fexists(paste(unix.file,".gz",sep=""))))
130        system(paste("gunzip ", attr(fe,'which'),'.gz',sep=''))
131      else if(any(fe <- fexists(paste(unix.file,".Z",sep=""))))
132        system(paste("uncompress ",attr(fe,'which'),'.Z',sep=''))
133    }
134
135    if(!any(fe <- fexists(unix.file))) {
136      stop(paste(sep = "", "Unix file, \"",
137                 paste(unix.file,collapse=' '),
138                 "\", does not exist"))
139    } else {
140      file.name <- attr(fe,'which')
141      if(!file.is.readable(file.name)) {
142        stop(paste(sep = "",
143                   "You do not have read permission for Unix file, \"",
144                   file.name, "\""))   # 22Oct00
145      }
146    }
147
148    cat("libname temp '", libraryName, "';\n", file = sasin, append = TRUE,
149        sep = "")
150
151    ## format.library should contain formats.sct containing user defined
152    ## formats used by this dataset.  It must be present.
153    cat("libname library '", format.library, "';\n", file = sasin,
154        append = TRUE, sep = "")
155    cat("%sas_get(temp.", member, ",\n",
156        "  ", sasout1, ",\n",
157        "  ", sasout2, ",\n",
158        "  ", sasout3, ",\n",
159        "  ", sasout4, ",\n",
160        "  dates=", dates., ",\n",
161        "  vars=",  varstring, ",\n",
162        "  ifs=",   ifs, ",\n",
163        "  formats=", as.integer(formats), "\n,",
164        "  specmiss=", as.integer(special.miss), ");\n",
165        file = sasin, append = TRUE, sep = "")
166  }
167
168  status <- system(paste(shQuote(sasprog), shQuote(sasin), "-log",
169                         shQuote(log.file)), intern=FALSE)
170  ## 24nov03 added output=F
171  if(status != 0) {
172    if(!quiet && fexists(log.file)) fileShow(log.file)  ## 4oct03
173    stop(paste("SAS job failed with status", status))
174  }
175  ##
176  ## Read in the variable information
177  ##
178  if(!(fexists(sasout1) && fexists(sasout2))) {
179    if(!quiet)
180      fileShow(log.file)  ## 4oct03
181
182    stop("SAS output files not found")
183  }
184
185  vars <-
186    scan(sasout1, list(name = "", type = 0, length = 0,
187                       format = "", label = "", n = 0),
188         multi.line = FALSE, sep = "\022",
189         flush=TRUE, comment.char='', quote='')
190  ## Thanks Don MacQueen for scan fix for R
191
192  nvar <- length(vars$name)
193  if(nvar == 0) {
194    if(!quiet)
195      fileShow(log.file)  ## 4oct03
196
197    stop("First SAS output is empty")
198  }
199
200  nrow <- vars$n[1]	#n is the same for each variable
201
202  ## Read the data in
203  ##  We try to be clever about the variable type.  If SAS is character
204  ##  use char of course.  If is numeric and length >4, use double.  If
205  ##  numeric and length <4, use single.  We could also use the format to
206  ##  choose further, if it consists of a number followed by a "."
207  ##  can we safely assume integer.
208  ##
209  type <- ifelse(vars$type == 2, "character(nrow)",
210                 ifelse(force.single,  ##28Mar01
211                        "single(nrow)", "double(nrow)"))
212  ##BILL: I corrected the macro so the following isn't needed:
213  ## get rid of trailing blank on names
214  ##	vars$name <- unix("sed 's/ $//'", vars$name)
215  inlist <- paste("\"", vars$name, "\"=", type,
216                  sep = "", collapse = ", ")
217
218  inlist <- parse(text = paste("list(", inlist, ")"))
219  ## Inlist would now be the size of the final data structure, if I had
220  ## evaluated it.
221
222  ## Read the data
223  ds <-
224    scan(sasout2, eval(inlist), sep = "\022", multi.line = FALSE,
225         flush=TRUE, comment.char='', quote='')
226
227  if(length(ds) < nvariables) {
228    m <- variables[is.na(match(variables, names(ds)))]
229    if(length(m) > 0) {
230      warning(paste(length(m),
231                    "requested variables did not exist:",
232                    paste("\"", m, "\"", sep = "", collapse = " "),
233                    "\n\t(use sas.contents())"))
234    }
235  }
236
237  format <- vars$format
238  format[format=='$'] <- ' '    # 1Mar00
239  label <- vars$label
240  name <- vars$name
241  esasout3 <- formats && fexists(sasout3)   #added formats && 1/20/93
242  if(recode && !esasout3) recode <- FALSE
243  FORMATS <- NULL
244
245  if(formats && esasout3) {
246    FORMATS <- dget(sasout3)
247    if(length(FORMATS)==0) {
248      FORMATS <- NULL;
249      recode <- FALSE
250    }
251  }
252
253  smiss <- NULL
254  if(special.miss && fexists(sasout4))
255    smiss <-
256      scan(sasout4, list(name="", code="", obs=integer(1)),
257           multi.line=FALSE, flush=TRUE, sep="\022",
258           comment.char='', quote='')
259
260  sasdateform <- c("date","mmddyy","yymmdd","ddmmyy","yyq","monyy",
261                   "julian","qtr","weekdate","weekdatx","weekday","month")
262  dateform <-
263    list(as.name("ddmmmyy"),"m/d/y","y/m/d","d/m/y",as.name("ddmmmyy"),
264         "mon year",as.name("ddmmmyy"),"mon",as.name("ddmmmyy"),
265         as.name("ddmmmyy"), as.name("ddmmmyy"),"m")
266
267  sastimeform <- c("hhmm","hour","mmss","time")
268  timeform <- c("h:m","h","m:s","h:m:s")
269  sasdatetimeform <- c("datetime","tod")
270  datetimeform <- list(list(as.name("ddmmmyy"),"h:m:s"), c("m/d/y"," "))
271  z <- "%02d%b%Y"
272  dateform4 <-
273    c(z,"%02m/%02d/%Y","%Y/%02m/%02d","%02d/%02m/%Y", z,"%02m %Y",
274      z,"%02m", z, z, z,"%02m")
275
276  timeform4 <- c("%02H:%02M","%02H","%02M:%02S","%02H:%02M:%02S")
277  datetimeform4 <- c("%02d%b%Y %02h:%02m:%02s","%02m/%02d/%Y")
278
279  ## Don MacQueen
280  days.to.adj <- as.numeric(difftime(ISOdate(1970,1,1,0,0,0) ,
281                                     ISOdate(1960,1,1,0,0,0), 'days'))
282  secs.to.adj <- days.to.adj*24*60*60
283
284  for(i in 1:nvar) {
285    atr <- list()
286    dsi <- ds[[i]]
287    fname <- format[i]
288    rec <- FALSE
289    if(fname!=" ") {
290      ff <- fname
291      if(dates.=="sas" & (m <- match(fname,sasdateform,0)) >0) {
292        ##look for partial dates
293        dd <- dsi-floor(dsi)
294        ddn <- !is.na(dd)
295        if(any(ddn) && any(dd[ddn]!=0)) {
296          ll <- 1:length(dd)
297          atr$partial.date <-
298            list(month=ll[dd==.5],day=ll[dd==.25],both=ll[dd==.75])
299          atr$imputed <- ll[dd!=0]
300          dsi <- floor(dsi)
301        }
302        dsi <- importConvertDateTime(dsi, 'date', 'sas',
303                                     form=dateform[m])
304
305        if(length(atr$imputed))
306          attr(dsi,'class') <- c("impute",attr(dsi,'class'))
307
308        ff <- NULL
309      } else {
310        if((m <- match(fname,sastimeform,0)) >0) {
311          dsi <- importConvertDateTime(dsi, 'time', 'sas',
312                                       form=timeform[m])
313          ff <- NULL
314        } else if((m <- match(fname,sasdatetimeform,0))>0) {
315          dsi <- importConvertDateTime(dsi, 'datetime', 'sas',
316                                       form=datetimeform[m])
317
318          ff <- NULL
319        }
320      }
321
322      atr$format <- ff
323      if(recode & length(g <- FORMATS[[fname]])) {
324        labs <- g$labels
325        if(!is.logical(recode)) {
326          labs <- if(recode==1) paste(g$values,":",labs,sep="")
327                  else paste(labs,"(",g$values,")",sep="")
328        }
329
330	dsi <- factor(dsi, g$values, labs)
331        atr$sas.codes <- g$values
332        rec <- TRUE
333      }
334    }
335
336    if(data.frame.out && !rec && vars$type[i]==2 &&
337       ((is.logical(as.is) && !as.is) ||
338        (is.numeric(as.is) && length(unique(dsi)) < as.is*length(dsi))))
339      dsi <- factor(dsi, exclude="") #exclude added 5Mar93
340
341    ## For data frames, char. var usually factors
342    if(label[i]!=" ")
343      label(dsi) <- label[i]  #atr$label <- label[i]
344
345    if(length(smiss$name)) {
346      j <- smiss$name==name[i]
347      if(any(j)) {
348        atr$special.miss <-
349          list(codes=smiss$code[j],obs=smiss$obs[j])
350        attr(dsi,'class') <- c("special.miss",attr(dsi,'class'))
351      }
352    }
353
354    if(!is.null(atr))
355      attributes(dsi) <- c(attributes(dsi),atr)
356
357    if(missing(where))
358      ds[[i]] <- dsi
359    else
360      assign(name[i], dsi, where=where)
361  }
362
363  if(!missing(where))
364    return(structure(where, class="where"))
365
366  atr <- list()
367
368  if(missing(id)) {
369    if(data.frame.out)
370      atr$row.names <- as.character(1:nrow)
371  } else {
372    idname <- id
373    jj <- match(idname, names(ds), 0)
374    if(any(jj==0))
375      stop(paste("id variable(s) not in dataset:",
376                 paste(idname[jj==0],collapse=" ")))
377
378    if(length(idname)==1) {
379      id <- ds[[idname]] #Need since not use data.frame
380    } else {
381      id <- as.character(ds[[idname[1]]])
382      for(jj in 2:length(idname))
383        id <- paste(id, as.character(ds[[idname[jj]]]))
384    }
385
386    if(check.unique.id) {
387      dup <- duplicated(id)
388      if(any(dup))
389        warning(paste("duplicate IDs:",
390                      paste(id[dup], collapse=" ")))
391    }
392
393    if(data.frame.out)
394      atr$row.names <- as.character(id)
395    else atr$id <- id
396  }
397
398  if(!is.null(FORMATS))
399    atr$formats <- FORMATS
400
401  if(data.frame.out)
402    atr$class <- "data.frame"
403
404  attributes(ds) <- c(attributes(ds),atr)
405  ds
406}
407
408importConvertDateTime <-
409  function(x, type=c('date','time','datetime'),
410           input=c('sas','spss','dataload'), form) {
411  type <- match.arg(type)
412  input <- match.arg(input)
413
414  if(input != 'sas' && type != 'date')
415    stop('only date variables are support for spss, dataload')
416
417  adjdays <- c(sas=3653, spss=141428, dataload=135080)[input]
418  ## 1970-1-1 minus 1960-1-1, 1582-10-14, or 1600-3-1
419  if(input=='spss') x <- x/86400
420
421  switch(type,
422         date = structure(x - adjdays, class='Date'),
423         time = {
424           ## Don MacQueen 3Apr02
425           z <- structure(x, class=c('POSIXt','POSIXct'))
426           f <- format(z, tz='GMT')
427           z <- as.POSIXct(format(z, tz='GMT'), tz='')
428           structure(z, class=c('timePOSIXt','POSIXt','POSIXct'))},
429         datetime = {
430           chron((x - adjdays*86400)/86400,
431                 out.format=c(dates='day mon year', times='h:m:s'))})
432}
433
434
435## Don MacQueen 3Apr02
436## slightly modified copy of format.POSIXct() from R base
437format.timePOSIXt <- function (x, format = "%H:%M:%S", tz = "",
438                               usetz = FALSE, ...) {
439  if (!inherits(x, c("timePOSIXt","POSIXct"))) stop("wrong class")
440  class(x) <- class(x)[-1]
441  structure(format.POSIXlt(as.POSIXlt(x, tz), format, usetz, ...),
442            names = names(x))
443}
444
445print.timePOSIXt <- function(x, ...) print(format(x, ...))
446
447
448##if(!.R.) {
449## Output format routine needed by chron for usual SAS date format
450ddmmmyy <- function(x)
451{
452  y <- month.day.year(trunc(oldUnclass(x)), attr(x,"origin"))
453  yr <- y$year
454  m <- c("Jan","Feb","Mar","Apr","May","Jun","Jul","Aug","Sep","Oct",
455         "Nov","Dec")[y$month]
456  ifelse(yr<1900 | yr>=2000, paste(y$day,m,yr,sep=""),
457         paste(y$day,m,yr-1900,sep=""))
458}
459
460
461## Functions to handle special.miss class
462is.special.miss <- function(x, code)
463{
464  sm <- attr(x, "special.miss")
465  if(!length(sm))
466    return(rep(FALSE, length(x)))
467
468  if(missing(code)) {
469    z <- rep(FALSE, length(x))
470    z[sm$obs] <- TRUE
471  } else {
472    z <- rep(FALSE, length(x))
473    z[sm$obs[sm$codes==code]] <- TRUE
474  }
475
476  z
477}
478
479
480"[.special.miss" <- function(x, ..., drop=FALSE)
481{
482  ats <- attributes(x)
483  ats$dimnames <- NULL
484  ats$dim <- NULL
485  ats$names <- NULL
486  attr(x,'class') <- NULL
487  y <- x[..., drop = drop]
488  if(length(y) == 0)
489    return(y)
490
491  k <- seq(along=x)
492  names(k) <- names(x)
493  k <- k[...]
494  attributes(y) <- c(attributes(y), ats)
495  smiss <- attr(y, "special.miss")
496  codes <- rep("ZZ",length(x))
497  codes[smiss$obs] <- smiss$codes
498  codes <- codes[...]
499  which <- codes!="ZZ"
500  if(sum(which)) attr(y,"special.miss") <-
501    list(obs=seq(along=k)[codes!="ZZ"],codes=codes[codes!="ZZ"])
502  else {
503    attr(y,"special.miss") <- NULL
504    attr(y,'class') <- attr(y,'class')[attr(y,'class') != "special.miss"]
505    if(length(attr(y,'class'))==0)
506      attr(y,'class') <- NULL
507  }
508
509  y
510}
511
512
513format.special.miss <- function(x, ...)
514{
515  w <-
516    if(is.factor(x))
517      as.character(x)
518    else {
519      cl <- attr(x,'class');
520      cl <- cl[cl!="special.miss"]
521      if(length(cl)) {
522        attr(x,'class') <- cl;
523        format(x, ...)
524      } else format.default(x, ...)
525    }
526
527  sm <- attr(x, "special.miss")
528  names(w) <- names(x)
529  if(!length(sm))
530    return(w)
531
532  w[sm$obs] <- sm$codes
533  attr(w,"label") <- attr(w,"special.miss") <- attr(w,"class") <- NULL
534  w
535}
536
537
538print.special.miss <- function(x, ...)
539{
540  sm <- attr(x, "special.miss")
541  if(!length(sm)) {
542    print.default(x)
543    return(invisible())
544  }
545
546  w <- format.special.miss(x)
547  print.default(w, quote=FALSE)
548  invisible()
549}
550
551
552sas.codes <- function(object) attr(object, "sas.codes")
553
554
555code.levels <- function(object) {
556  if(length(cod <- attr(object,"sas.codes")))
557    levels(object) <- paste(cod,":",levels(object),sep="")
558
559  object
560}
561
562
563as.data.frame.special.miss <- function(x, row.names = NULL, optional = FALSE, ...)
564{
565  nrows <- length(x)
566  if(is.null(row.names)) {
567    ## the next line is not needed for the 1993 version of data.class and is
568    ## included for compatibility with 1992 version
569    if(length(row.names <- names(x)) == nrows &&
570       !any(duplicated(row.names))) {
571    }
572    else if(optional)
573      row.names <- character(nrows)
574    else row.names <- as.character(1:nrows)
575  }
576
577  value <- list(x)
578  if(!optional)
579    names(value) <- deparse(substitute(x))[[1]]
580
581  structure(value, row.names=row.names, class='data.frame')
582}
583
584
585## val{nval}=compress(value)||"" was =value  23mar04
586sas.get.macro <-
587  c("/* Macro sas_get (modified by F. Harrell 30Jan90, Bill Dunlap Dec90, FH Mar92,",
588    "\t\t\tFH Apr95 (extend LENGTH smiss))",
589    "    Sets up for conversion of SAS dataset to S dataset.",
590    "    Arguments:", "\tdataset - name of SAS dataset",
591    "\ttemp1\t- Name of temporary dataset to contain data dictionar (unquoted)",
592    "\t\t  default=/tmp/file.1",
593    "\ttemp2\t- Name of temporary dataset to contain ASCII version of SAS",
594    "\t\t  dataset (unquoted)", "\t\t  default=/tmp/file.2",
595    "\ttemp3   - Name of temporary dataset to contain ASCII file with S",
596    "\t\t  program to store format values and labels",
597    "\ttemp4   - Name of temporary dataset to contain ASCII file with",
598    "\t\t  locations of special missing values",
599    "\tdates\t- SAS to store date variables in SAS format ( # days from 1/1/60)",
600    "\t\t  (default)",
601    "\t\t- YEARFRAC to store as days from 1/1/1900, divided by 365.25",
602    "\t\t- YEARFRAC2 to store as year + fraction of current year",
603    "\t\t- YYMMDD to store as numeric YYMMDD",
604    "\tvars    - list of variable in dataset that you want returned to S",
605    "                  (unquoted, separate variable names with spaces)  If empty,",
606    "                  then return all variables.",
607    "        ifs     - sequence of SAS subsetting if statements, (unquoted,",
608    "                  separated by semicolons).",
609    "\tformats - 0 (default) - do not create file on temp3 containing S",
610    "\t\t  statements to store format values and labels, 1 do create",
611    "\tspecmiss- 0 (default).  Set to 1 to write a data file on temp4 with",
612    "\t\t  the fields: variable name, special missing value code,",
613    "\t\t  observation number",
614    "                                                                              */",
615    "%macro sas_get(dataset,  temp1, temp2, temp3, temp4, dates=SAS, vars=, ifs=, ",
616    "\tformats=0, specmiss=0);",
617    "OPTIONS NOFMTERR;",
618    "%IF %QUOTE(&temp1)=  %THEN %LET temp1=/tmp/file.1;",
619    "%IF %QUOTE(&temp2)=  %THEN %LET temp2=/tmp/file.2;",
620    "%IF %QUOTE(&temp3)=  %THEN %LET temp3=/tmp/file.3;",
621    "%IF %QUOTE(&temp4)=  %THEN %LET temp4=/tmp/file.4;",
622    ## Next line had %QUOTE(&ifs),1,\"'\"  31oct02
623    "%LET dates=%UPCASE(&dates);", "%LET ifs=%SCAN(%QUOTE(&ifs),1,'');",
624    "%LET _s_=_sav_;",
625    "/* BILL: Can these 2 subsets be combined into one pass of the data? -Frank*/",
626    "/* Subset by observation first */", "%IF %QUOTE(&ifs)^= %THEN %DO;",
627    " data _osub_ ;", "  set &dataset ;", "  &ifs ;",
628    " %LET dataset=_osub_ ;", " %END;", "/* Then subset by variable */",
629    "%IF &vars^= %THEN %DO;", " data _vsub_ ;", "  set &dataset ;",
630    "  keep &vars ;", " %LET dataset=_vsub_ ;", " %END;",
631    "proc contents data=&dataset out=&_s_(KEEP=name type length label format nobs ",
632    " varnum) noprint; ", "%IF &formats=1 %THEN %DO;",
633    "   PROC FORMAT LIBRARY=LIBRARY CNTLOUT=f(KEEP=fmtname type start end label);",
634    "   DATA f; SET f; RETAIN n 0; n+1; IF type=\"C\" THEN fmtname=\"$\"||fmtname;",
635    "   PROC SORT DATA=f OUT=f(DROP=n); BY fmtname n; ",
636    "  *Sort by n instead of start for numerics so 13 sorts after 2;",
637    "  *Dont consider formats containing ANY range of values;",
638    "  *Dont consider formats that dont have at least one non-missing (if",
639    "   numeric) starting value.  This gets rid of formats that are used",
640    "   only to label special missing values;",
641    "   DATA f2; SET f; BY fmtname; RETAIN anyrange 0 anynmiss 0;",
642    "      IF FIRST.fmtname THEN DO;anyrange=0;anynmiss=0;END;",
643    "      IF start^=end THEN anyrange=1;",
644    "      IF TYPE=\"C\" THEN anynmiss=1; ",
645    "      ELSE IF (start+0)>. THEN anynmiss=1;",
646    "      IF LAST.fmtname & anynmiss & ^anyrange THEN OUTPUT; KEEP fmtname;",
647    "   DATA f; MERGE f f2(IN=in2); BY fmtname; IF in2;",
648    "      IF TYPE=\"N\" THEN DO; IF (start+0)>.;  *S cannot handle special missings;",
649    "         END;", "      RENAME fmtname=format start=value; DROP end;",
650    "   PROC SORT DATA=&_s_(KEEP=format) OUT=sform; BY format;",
651    "   DATA sform; SET sform; BY format; IF LAST.format;",
652    "   DATA f; MERGE sform(IN=in1) f(IN=in2); BY format; ",
653    "      IF in1 & in2;",
654    "   *This keeps formats ever used by any variable;",
655    "   DATA _NULL_; SET f END=_eof_; BY format;",
656    "      ARRAY val{*} $ 16 val1-val500; ARRAY lab{*} $ 40 lab1-lab500; ",
657    "      RETAIN done 0 nform 0 nval 0 val1-val500 \" \" lab1-lab500 \" \" bk -1; ",
658    "      FILE \"&temp3\" LRECL=4096;", "      IF FIRST.format THEN DO;",
659    "         IF ^done THEN PUT 'list(' @@;  done=1;",
660    "         nform=nform+1; nval=0;",
661    "         format=TRANSLATE(format,\".abcdefghijklmnopqrstuvwxyz\",",
662    "                                 \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");",
663    "          IF nform=1 THEN PUT '\"' format +bk '\"=list(' @@;",
664    "         ELSE PUT ', \"' format +bk '\"=list(' @@;", "         END;",
665    "      nval=nval+1; ",
666    "      IF nval>500 THEN DO; ERROR \">500 format values not allowed\";ABORT ABEND;",
667    "         END;", '      val{nval}=compress(value)||""; lab{nval}=label; ',
668    "      IF LAST.format THEN DO;", "         PUT \"values=c(\" @@; ",
669    "         DO i=1 TO nval; IF i>1 THEN PUT \",\" @@;",
670    "            IF type=\"N\" THEN PUT val{i} +bk @@;",
671    "            ELSE PUT '\"' val{i} +bk '\"'  @@;", "            END;",
672    "         PUT \"),labels=c(\" @@;",
673    "         DO i=1 TO nval; IF i>1 THEN PUT \",\" @@;",
674    "            PUT '\"' lab{i} +bk '\"' @@;", "            END;",
675    "         PUT \"))\";", "         END;",
676    "      IF _eof_ THEN PUT \")\";", "   %END;",
677    "PROC SORT DATA=&_s_;BY varnum;", "data _null_;", " set &_s_ end=eof;",
678    " FILE \"&temp1\";  RETAIN _bk_ -1;", " if _n_ = 1 then do;",
679    "%IF &specmiss=0 %THEN %LET ofile=_NULL_; ",
680    "%ELSE %LET ofile=smiss(KEEP=vname val obs);",
681    "  put \"data &ofile; set &dataset end=eof;\";",
682    "  put '  file \"&temp2\" RECFM=D LRECL=4096;';",
683    "  put \"  retain __delim 18 _bk_ -1 obs 0; LENGTH _xx_ $ 20 obs 5;obs+1; \";",
684    "%IF &specmiss=1 %THEN %DO;",
685    "  put \"LENGTH vname $ 8 val $ 1;\"; %END;", "  end;",
686    " IF type=2 THEN DO;", "  PUT 'FORMAT ' name ';' @;",
687    "  PUT 'IF ' name '=\" \" THEN PUT __delim IB1. @;';",
688    "/* $char added F.H. 24Mar92, dropped  +_bk_ before __delim */",
689    "/* $CHAR. removed FEH 2Aug92, added null FORMAT above, added back +_bk_ */",
690    "  PUT 'ELSE PUT ' name '+_bk_ __delim IB1. @;';", "  END;",
691    " ELSE DO; ", "  PUT 'IF ' name '<=.Z THEN _xx_=\"NA\";' @;",
692    "  PUT 'ELSE _xx_=LEFT(PUT(' @;", "  format=UPCASE(format);",
693    "  IF format=\"DATE\"|format=\"MMDDYY\"|format=\"YYMMDD\"|",
694    "format=\"DDMMYY\"|format=\"YYQ\"|format=\"MONYY\"|format=\"JULIAN\" THEN DO;",
695    "   %IF &dates=SAS %THEN", "    PUT name \",BEST18.)\";",
696    "   %ELSE %IF &dates=YYMMDD %THEN", "    PUT name \",YYMMDD6.)\";",
697    "   %ELSE %IF &dates=YEARFRAC %THEN",
698    "    PUT \"(\" name \"-MDY(1,1,1900))/365.25,7.3)\";",
699    "   %ELSE %IF &dates=YEARFRAC2 %THEN %DO;",
700    "    PUT \"YEAR(\" name \")-1900+(\" name \"-MDY(1,1,YEAR(\" name \")))/\" @;",
701    "    PUT \"(MDY(12,31,YEAR(\" name \"))-MDY(1,1,YEAR(\" name \"))+1),7.3)\";",
702    "    %END;", "   ;", "   END;\t",
703    "  ELSE DO;PUT name \",BEST18.)\" @;END;",
704    "  PUT ');  PUT _xx_ +_bk_ __delim IB1. @;';  *Added +_bk_ 2Aug92;",
705    "%IF &specmiss=1 %THEN %DO;",
706    "  put 'IF .A<=' name '<=.Z THEN DO;",
707    "   vname=\"' name +_bk_ '\"; val=put(' name ',1.); OUTPUT; END;';",
708    "  %END;", "  END;", "if eof then PUT 'PUT; RUN;';", "run;",
709    "%include \"&temp1\";", "data _null_; set &_s_;",
710    " retain __delim 18 _bk_ -1; ", " file \"&temp1\" LRECL=4096;",
711    " name=TRANSLATE(name,\".abcdefghijklmnopqrstuvwxyz\",",
712    "\t\t     \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");",
713    " format=TRANSLATE(format,\".abcdefghijklmnopqrstuvwxyz\",",
714    "                         \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");",
715    " put name +_bk_ __delim IB1. type +_bk_ __delim IB1. length +_bk_ __delim IB1.",
716    "  format +_bk_ __delim IB1. label +_bk_ __delim IB1. nobs +_bk_ __delim IB1.;",
717    "run;", "%IF &specmiss=1 %THEN %DO;",
718    " PROC SORT DATA=smiss OUT=smiss;BY vname val obs;",
719    " DATA _NULL_; SET smiss;FILE \"&temp4\" RECFM=D LRECL=30;",
720    " RETAIN _bk_ -1 __delim 18;",
721    " vname=TRANSLATE(vname,\".abcdefghijklmnopqrstuvwxyz\",",
722    "\t\t       \"_ABCDEFGHIJKLMNOPQRSTUVWXYZ\");",
723    " PUT vname +_bk_ __delim IB1. val +_bk_ __delim IB1. obs +_bk_ __delim IB1.;",
724    " RUN;", " %END;", "%mend sas_get;")
725
726cleanup.import <-
727  function(obj, labels=NULL, lowernames=FALSE,
728           force.single=TRUE, force.numeric=TRUE,
729           rmnames=TRUE,
730           big=1e20, sasdict,
731           pr=prod(dimobj) > 5e5,
732           datevars=NULL, datetimevars=NULL,
733           dateformat='%F', fixdates=c('none','year'),
734           charfactor=FALSE)
735{
736  fixdates <- match.arg(fixdates)
737  nam <- names(obj)
738  dimobj <- dim(obj)
739  nv <- length(nam)
740
741  if(!missing(sasdict))
742    {
743      sasvname <- makeNames(sasdict$NAME)
744      if(any(w <- nam %nin% sasvname))
745        stop(paste('The following variables are not in sasdict:',
746                   paste(nam[w],collapse=' ')))
747
748      saslabel <- structure(as.character(sasdict$LABEL),
749                            names=as.character(sasvname))
750      labels <- saslabel[nam]
751      names(labels) <- NULL
752    }
753
754  if(length(labels) && length(labels) != dimobj[2])
755    stop('length of labels does not match number of variables')
756
757  if(lowernames)
758    names(obj) <- casefold(nam)
759
760  if(pr)
761    cat(dimobj[2],'variables; Processing variable:')
762
763  for(i in 1:dimobj[2])
764    {
765      if(pr) cat(i,'')
766
767      x <- obj[[i]];
768      modif <- FALSE
769      if(length(dim(x)))
770        next
771
772      if(rmnames)
773        {
774          if(length(attr(x,'names')))
775            {
776              attr(x,'names') <- NULL
777              modif <- TRUE
778            } else if(length(attr(x,'.Names')))
779              {
780                attr(x,'.Names') <- NULL
781                modif <- TRUE
782              }
783        }
784
785      if(length(attr(x,'Csingle'))) {
786        attr(x,'Csingle') <- NULL
787        modif <- TRUE
788      }
789
790    if(length(c(datevars,datetimevars)) &&
791       nam[i] %in% c(datevars,datetimevars) &&
792       !all(is.na(x))) {
793      if(!(is.factor(x) || is.character(x)))
794        stop(paste('variable',nam[i],
795                   'must be a factor or character variable for date conversion'))
796
797      x <- as.character(x)
798      ## trim leading and trailing white space
799      x <- sub('^[[:space:]]+','',sub('[[:space:]]+$','', x))
800      xt <- NULL
801      if(nam[i] %in% datetimevars) {
802        xt <- gsub('.* ([0-9][0-9]:[0-9][0-9]:[0-9][0-9])','\\1',x)
803        xtnna <- setdiff(xt, c('',' ','00:00:00'))
804        if(!length(xtnna)) xt <- NULL
805        x <- gsub(' [0-9][0-9]:[0-9][0-9]:[0-9][0-9]','',x)
806      }
807      if(fixdates != 'none') {
808        if(dateformat %nin% c('%F','%y-%m-%d','%m/%d/%y','%m/%d/%Y'))
809          stop('fixdates only supported for dateformat %F %y-%m-%d %m/%d/%y %m/%d/%Y')
810
811        x <- switch(dateformat,
812                    '%F'      =gsub('^([0-9]{2})-([0-9]{1,2})-([0-9]{1,2})', '20\\1-\\2-\\3',x),
813                    '%y-%m-%d'=gsub('^[0-9]{2}([0-9]{2})-([0-9]{1,2})-([0-9]{1,2})', '\\1-\\2-\\3',x),
814                    '%m/%d/%y'=gsub('^([0-9]{1,2})/([0-9]{1,2})/[0-9]{2}([0-9]{2})', '\\1/\\2/\\3',x),
815                    '%m/%d/%Y'=gsub('^([0-9]{1,2})/([0-9]{1,2})/([0-9]{2})$','\\1/\\2/20\\3',x))
816      }
817      x <- if(length(xt)) {
818        require('chron')
819        cform <- if(dateformat=='%F') 'y-m-d'
820        else gsub('%','',tolower(dateformat))
821        chron(x, xt, format=c(dates=cform,times='h:m:s'))
822      }
823      else as.Date(x, format=dateformat)
824      modif <- TRUE
825    }
826
827      if(length(labels)) {
828        label(x) <- labels[i]
829        modif <- TRUE
830      }
831
832      if(force.numeric && length(lev <- levels(x))) {
833        if(all.is.numeric(lev)) {
834          labx <- attr(x,'label')
835          x <- as.numeric(as.character(x))
836          label(x) <- labx
837          modif <- TRUE
838        }
839      }
840
841      if(storage.mode(x) == 'double') {
842        xu <- oldUnclass(x)
843        j <- is.infinite(xu) | is.nan(xu) | abs(xu) > big
844        if(any(j,na.rm=TRUE)) {
845          x[j] <- NA
846          modif <- TRUE
847          if(pr)
848            cat('\n')
849
850          cat(sum(j,na.rm=TRUE),'infinite values set to NA for variable',
851              nam[i],'\n')
852        }
853
854        isdate <- testDateTime(x)
855        if(force.single && !isdate) {
856          allna <- all(is.na(x))
857          if(allna) {
858            storage.mode(x) <- 'integer'
859            modif <- TRUE
860          }
861
862          if(!allna) {
863            notfractional <- !any(floor(x) != x, na.rm=TRUE)
864            if(max(abs(x),na.rm=TRUE) <= (2^31-1) && notfractional) {
865              storage.mode(x) <- 'integer'
866              modif <- TRUE
867            }
868          }
869        }
870      }
871
872      if(charfactor && is.character(x)) {
873        if(length(unique(x)) < .5*length(x)) {
874          x <- sub(' +$', '', x)  # remove trailing blanks
875          x <- factor(x, exclude='')
876          modif <- TRUE
877        }
878      }
879
880      if(modif) obj[[i]] <- x
881      NULL
882    }
883
884  if(pr) cat('\n')
885  if(!missing(sasdict)) {
886    sasat <- sasdict[1,]
887    attributes(obj) <- c(attributes(obj),
888                         sasds=as.character(sasat$MEMNAME),
889                         sasdslabel=as.character(sasat$MEMLABEL))
890  }
891
892  obj
893}
894
895upData <- function(object, ...,
896                   rename=NULL, drop=NULL,
897                   labels=NULL, units=NULL, levels=NULL,
898                   force.single=TRUE, lowernames=FALSE, caplabels=FALSE,
899                   moveUnits=FALSE, charfactor=FALSE) {
900
901  upfirst <- function(txt) gsub("(\\w)(\\w*)", "\\U\\1\\L\\2", txt, perl=TRUE)
902
903  n  <- nrow(object)
904  if(!length(n)) {
905    x <- object[[1]]
906    d <- dim(x)
907    n <- if(length(d)) d[1]
908         else length(x)
909  }
910
911  rnames <- row.names(object)
912
913  if(lowernames)
914    names(object) <- casefold(names(object))
915  no <- names(object)
916
917  cat('Input object size:\t',object.size(object),'bytes;\t',
918      length(no),'variables\n')
919
920  ## The following is targeted at R workspaces exported from StatTransfer
921  al <- attr(object, 'var.labels')
922  if(length(al)) {
923    if(caplabels) al <- upfirst(al)
924    for(i in 1:length(no))
925      if(al[i] != '') label(object[[i]]) <- al[i]
926    attr(object, 'var.labels') <- NULL
927    if(missing(force.single)) force.single <- FALSE
928  } else if(caplabels) {
929    for(i in 1:length(no))
930      if(length(la <- attr(object[[i]], 'label')))
931        attr(object[[i]], 'label') <- upfirst(la)
932  }
933
934  if(moveUnits)
935    for(i in 1:length(no)) {
936      z <- object[[i]]
937      lab <- attr(z,'label')
938      if(!length(lab) || length(attr(z,'units')))
939        next
940
941      paren <- length(grep('\\(.*\\)',lab))
942      brack <- length(grep('\\[.*\\]',lab))
943      if(paren+brack == 0)
944        next
945
946      cat('Label for',no[i],'changed from',lab,'to ')
947      u <- if(paren)regexpr('\\(.*\\)',lab)
948           else regexpr('\\[.*\\]',lab)
949
950      len <- attr(u,'match.length')
951      un <- substring(lab, u+1, u+len-2)
952      lab <- substring(lab, 1, u-1)
953      if(substring(lab, nchar(lab), nchar(lab)) == ' ')
954        lab <- substring(lab, 1, nchar(lab)-1) # added 2nd char above 8jun03
955
956      cat(lab,'\n\tunits set to ',un,'\n',sep='')
957      attr(z,'label') <- lab
958      attr(z,'units') <- un
959      object[[i]] <- z
960    }
961
962  if(length(rename)) {
963    nr <- names(rename)
964    if(length(nr)==0 || any(nr==''))
965      stop('the list or vector specified in rename must specify variable names')
966
967    for(i in 1:length(rename)) {
968      if(nr[i] %nin% no)
969        stop(paste('unknown variable name:',nr[i]))
970
971      cat('Renamed variable\t', nr[i], '\tto', rename[[i]], '\n')
972    }
973
974    no[match(nr, no)] <- unlist(rename)
975    names(object) <- no
976  }
977
978  z <- substitute(list(...))
979
980  if(length(z) > 1) {
981    z <- z[-1]
982    vn <- names(z)
983    if(!length(vn) || any(vn==''))
984      stop('variables must all have names')
985
986    for(i in 1:length(z)) {
987      v <- vn[i]
988      if(v %in% no)
989        cat('Modified variable\t',v,'\n')
990      else {
991        cat('Added variable\t\t', v,'\n')
992        no <- c(no, v)
993      }
994
995      x <- eval(z[[i]], object)
996      d <- dim(x)
997      lx <- if(length(d))d[1]
998            else length(x)
999
1000      if(lx != n) {
1001        if(lx == 1)
1002          warning(paste('length of ',v,
1003                        ' is 1; will replicate this value.',sep=''))
1004        else {
1005          f <- find(v)
1006          if(length(f))cat('Variable',v,'found in',
1007                           paste(f,collapse=' '),'\n')
1008
1009          stop(paste('length of ',v,' (',lx, ')\n',
1010                     'does not match number of rows in object (',
1011                     n,')',sep=''))
1012        }
1013      }
1014
1015      ## If x is factor and is all NA, user probably miscoded. Add
1016      ## msg.
1017      if(is.factor(x) && all(is.na(x)))
1018        warning(paste('Variable ',v,'is a factor with all values NA.\n',
1019                      'Check that the second argument to factor() matched the original levels.\n',
1020                      sep=''))
1021
1022      object[[v]] <- x
1023    }
1024  }
1025
1026  if(force.single) {
1027    sm <- sapply(object, storage.mode)
1028    if(any(sm=='double'))
1029      for(i in 1:length(sm)) {   # 28Mar01
1030        if(sm[i]=='double') {
1031          x <- object[[i]]
1032          if(testDateTime(x))
1033            next   ## 31aug02
1034
1035          if(all(is.na(x)))
1036            storage.mode(object[[i]]) <- 'integer'
1037          else {
1038            notfractional <- !any(floor(x) != x, na.rm=TRUE)  ## 28Mar01
1039            ## max(abs()) 22apr03
1040            if(notfractional && max(abs(x),na.rm=TRUE) <= (2^31-1))
1041              storage.mode(object[[i]]) <- 'integer'
1042          }
1043        }
1044      }
1045  }
1046
1047  if(charfactor) {
1048    g <- function(z) {
1049      if(!is.character(z)) return(FALSE)
1050      length(unique(z)) < .5*length(z)
1051    }
1052    mfact <- sapply(object, g)
1053    if(any(mfact))
1054      for(i in (1:length(mfact))[mfact]) {
1055        x <- sub(' +$', '', object[[i]])  # remove trailing blanks
1056        object[[i]] <- factor(x, exclude='')
1057      }
1058  }
1059
1060  if(length(drop)) {
1061    if(length(drop)==1)
1062      cat('Dropped variable\t',drop,'\n')
1063    else
1064      cat('Dropped variables\t',paste(drop,collapse=','),'\n')
1065
1066    s <- drop %nin% no
1067    if(any(s))
1068      warning(paste('The following variables in drop= are not in object:',
1069                    paste(drop[s],collapse=' ')))
1070
1071    no <- no[no %nin% drop]
1072    object <- object[no]
1073  }
1074
1075  if(length(levels)) {
1076    if(!is.list(levels))
1077      stop('levels must be a list')
1078
1079    nl <- names(levels)
1080    s <- nl %nin% no
1081    if(any(s)) {
1082      warning(paste('The following variables in levels= are not in object:',
1083                    paste(nl[s],collapse=' ')))
1084      nl <- nl[!s]
1085    }
1086
1087    for(n in nl) {
1088      if(!is.factor(object[[n]]))
1089        object[[n]] <- as.factor(object[[n]])
1090
1091      levels(object[[n]]) <- levels[[n]]
1092      ## levels[[nn]] will usually be a list; S+ invokes merge.levels
1093    }
1094  }
1095
1096  if(length(labels)) {
1097    nl <- names(labels)
1098    if(!length(nl)) stop('elements of labels were unnamed')
1099    s <- nl %nin% no
1100    if(any(s)) {
1101      warning(paste('The following variables in labels= are not in object:',
1102                    paste(nl[s], collapse=' ')))
1103      nl <- nl[!s]
1104    }
1105
1106    for(n in nl)
1107      label(object[[n]]) <- labels[[n]]
1108  }
1109
1110  if(length(units)) {
1111    ##if(!is.list(units))stop('units must be a list')
1112    nu <- names(units)
1113    s <- nu %nin% no
1114    if(any(s)) {
1115      warning(paste('The following variables in units= are not in object:',
1116                    paste(nu[s], collapse=' ')))
1117      nu <- nu[!s]
1118    }
1119    for(n in nu)
1120      attr(object[[n]],'units') <- units[[n]]
1121  }
1122
1123  cat('New object size:\t',object.size(object),'bytes;\t',
1124      length(no),'variables\n')
1125  ## if(.R.) object <- structure(object, class='data.frame', row.names=rnames)
1126    object
1127  }
1128
1129dataframeReduce <- function(data, fracmiss=1, maxlevels=NULL,
1130                            minprev=0, pr=TRUE)
1131  {
1132    g <- function(x, fracmiss, maxlevels, minprev)
1133      {
1134        if(is.matrix(x))
1135          {
1136            f <- mean(is.na(x %*% rep(1,ncol(x))))
1137            return(if(f > fracmiss)
1138                   paste('fraction missing>',fracmiss,sep='') else '')
1139          }
1140        h <- function(a, b)
1141          if(a=='') b else if(b=='') a else paste(a, b, sep=';')
1142        f <- mean(is.na(x))
1143        x <- x[!is.na(x)]
1144        n <- length(x)
1145        r <- if(f > fracmiss)
1146          paste('fraction missing>',fracmiss,sep='') else ''
1147        if(is.character(x)) x <- factor(x)
1148        if(length(maxlevels) && is.category(x) &&
1149           length(levels(x)) > maxlevels)
1150          return(h(r, paste('categories>',maxlevels,sep='')))
1151        s <- ''
1152        if(is.category(x) || length(unique(x))==2)
1153          {
1154            tab <- table(x)
1155            if((min(tab) / n) < minprev)
1156              {
1157                if(is.category(x))
1158                  {
1159                    x <- combine.levels(x, minlev=minprev)
1160                    s <- 'grouped categories'
1161                    if(length(levels(x)) < 2)
1162                      s <- paste('prevalence<', minprev, sep='')
1163                  }
1164                else s <- paste('prevalence<', minprev, sep='')
1165              }
1166          }
1167        h(r, s)
1168      }
1169    h <- sapply(data, g, fracmiss, maxlevels, minprev)
1170    if(all(h=='')) return(data)
1171    if(pr)
1172      {
1173        cat('\nVariables Removed or Modified\n\n')
1174        print(data.frame(Variable=names(data)[h!=''],
1175                         Reason=h[h!=''], row.names=NULL, check.names=FALSE))
1176        cat('\n')
1177      }
1178    s <- h=='grouped categories'
1179    if(any(s)) for(i in which(s))
1180      data[[i]] <- combine.levels(data[[i]], minlev=minprev)
1181    if(any(h != '' & !s)) data <- data[h=='' | s]
1182    data
1183  }
1184
1185spss.get <- function(file, lowernames=FALSE,
1186                     datevars=NULL,
1187                     use.value.labels=TRUE,
1188                     to.data.frame=TRUE,
1189                     max.value.labels=Inf,
1190                     force.single=TRUE, allow=NULL, charfactor=FALSE)
1191  {
1192    require('foreign')
1193    if(length(grep('http://', file))) {
1194      tf <- tempfile()
1195      download.file(file, tf, mode='wb', quiet=TRUE)
1196      file <- tf
1197    }
1198
1199    w <- read.spss(file, use.value.labels=use.value.labels,
1200                   to.data.frame=to.data.frame,
1201                   max.value.labels=max.value.labels)
1202
1203    a   <- attributes(w)
1204    vl  <- a$variable.labels
1205    nam <- a$names
1206    nam <- makeNames(a$names, unique=TRUE, allow=allow)
1207    if(lowernames) nam <- casefold(nam)
1208    names(w) <- nam
1209
1210    lnam <- names(vl)
1211    if(length(vl))
1212      for(i in 1:length(vl)) {
1213        n <- lnam[i]
1214        lab <- vl[i]
1215        if(lab != '' && lab != n) label(w[[i]]) <- lab
1216      }
1217
1218    attr(w, 'variable.labels') <- NULL
1219    if(force.single || length(datevars) || charfactor)
1220      for(v in nam) {
1221        x <- w[[v]]
1222        changed <- FALSE
1223        if(v %in% datevars) {
1224          x <- importConvertDateTime(x, 'date', 'spss')
1225          changed <- TRUE
1226        } else if(all(is.na(x))) {
1227          storage.mode(x) <- 'integer'
1228          changed <- TRUE
1229        } else if(!(is.factor(x) || is.character(x))) {
1230          if(all(is.na(x))) {
1231            storage.mode(x) <- 'integer'
1232            changed <- TRUE
1233          } else if(max(abs(x),na.rm=TRUE) <= (2^31-1) &&
1234                    all(floor(x) == x, na.rm=TRUE)) {
1235            storage.mode(x) <- 'integer'
1236            changed <- TRUE
1237          }
1238        } else if(charfactor && is.character(x)) {
1239          if(length(unique(x)) < .5*length(x))
1240            {
1241              x <- sub(' +$', '', x)  # remove trailing blanks
1242              x <- factor(x, exclude='')
1243              changed <- TRUE
1244            }
1245        }
1246        if(changed) w[[v]] <- x
1247      }
1248
1249    w
1250  }
1251
1252  sasxport.get <- function(file, force.single=TRUE,
1253                           method=c('read.xport','dataload','csv'),
1254                           formats=NULL, allow=NULL, out=NULL,
1255                           keep=NULL, drop=NULL, as.is=0.5, FUN=NULL)
1256  {
1257    method <- match.arg(method)
1258    if(length(out) && method!='csv')
1259      stop('out only applies to method="csv"')
1260
1261    if(method != 'csv')
1262      require('foreign') || stop('foreign package is not installed')
1263
1264    rootsoftware <- if(method=='dataload')'dataload'
1265                    else 'sas'
1266
1267    sasdateform <-
1268      toupper(c("date","mmddyy","yymmdd","ddmmyy","yyq","monyy",
1269                "julian","qtr","weekdate","weekdatx","weekday","month"))
1270    sastimeform     <- toupper(c("hhmm","hour","mmss","time"))
1271    sasdatetimeform <- toupper(c("datetime","tod"))
1272
1273    if(length(grep('http://', file))) {
1274      tf <- tempfile()
1275      download.file(file, tf, mode='wb', quiet=TRUE)
1276      file <- tf
1277    }
1278
1279    dsinfo <-
1280      if(method == 'csv') lookupSASContents(file)
1281      else lookup.xport(file)
1282
1283    whichds <-
1284      if(length(keep))
1285        keep
1286      else
1287        setdiff(names(dsinfo), c(drop,'_CONTENTS_','_contents_'))
1288
1289  ds <- switch(method,
1290               read.xport= read.xport(file),
1291               dataload  = read.xportDataload(file, whichds),
1292               csv       = if(!length(out))
1293                             readSAScsv(file, dsinfo, whichds))
1294
1295    if(method=='read.xport' && (length(keep) | length(drop)))
1296      ds <- ds[whichds]
1297
1298    ## PROC FORMAT CNTLOUT= dataset present?
1299    fds <- NULL
1300    if(!length(formats)) {
1301      fds <- sapply(dsinfo, function(x)
1302                    all(c('FMTNAME','START','END','MIN','MAX','FUZZ')
1303                        %in% x$name))
1304      fds <- names(fds)[fds]
1305      if(length(fds) > 1) {
1306        warning('transport file contains more than one PROC FORMAT CNTLOUT= dataset; using only the first')
1307        fds <- fds[1]
1308      }
1309    }
1310
1311    finfo <- NULL
1312    if(length(formats) || length(fds)) {
1313      finfo <-
1314        if(length(formats))
1315          formats
1316        else if(length(out))
1317          readSAScsv(file, dsinfo, fds)
1318        else ds[[fds]]
1319
1320      ## Remove leading $ from char format names
1321      ##  fmtname <- sub('^\\$','',as.character(finfo$FMTNAME))
1322      fmtname <- as.character(finfo$FMTNAME)
1323      finfo <- split(finfo[c('START','END','LABEL')], fmtname)
1324      finfo <- lapply(finfo,
1325                      function(f)
1326                      {
1327                        rb <- function(a)
1328                        {  # remove leading + trailing blanks
1329                          a <- sub('[[:space:]]+$', '', as.character(a))
1330                          sub('^[[:space:]]+', '', a)
1331                        }
1332
1333                        st <- rb(f$START)
1334                        en <- rb(f$END)
1335                        lab <- rb(f$LABEL)
1336                        ##j <- is.na(st) | is.na(en)
1337                        ##  st %in% c('','.','NA') | en %in% c('','.','NA')
1338                        j <- is.na(st) | is.na(en) | st == '' | en == ''
1339                        if(any(j)) {
1340                          warning('NA in code in FORMAT definition; removed')
1341                          st <- st[!j]; en <- en[!j]; lab <- lab[!j]
1342                        }
1343
1344                        if(!all(st==en))
1345                          return(NULL)
1346
1347                        list(value = all.is.numeric(st, 'vector'),
1348                             label = lab)
1349                      })
1350    }
1351
1352    ## Number of non-format datasets
1353    nods <- length(whichds)
1354    nds  <- nods - (length(formats) == 0 && length(finfo) > 0)
1355    which.regular <- setdiff(whichds, fds)
1356    dsn <- tolower(which.regular)
1357
1358    if((nds > 1) && !length(out)) {
1359      res <- vector('list', nds)
1360      names(res) <- gsub('_','.',dsn)
1361    }
1362
1363    if(length(FUN)) {
1364      funout <- vector('list', length(dsn))
1365      names(funout) <- gsub('_','.',dsn)
1366    }
1367    possiblyConvertChar <- if(method=='read.xport')
1368      (is.logical(as.is) && as.is)  ||
1369    (is.numeric(as.is) && as.is < 1) else
1370      (is.logical(as.is) && !as.is) ||
1371    (is.numeric(as.is) && as.is > 0)
1372    ## reverse logic because read.xport always converts characters to factors
1373    j <- 0
1374    for(k in which.regular) {
1375      j   <- j + 1
1376      cat('Processing SAS dataset', k, '\t ')
1377      w   <-
1378        if(length(out))
1379          readSAScsv(file, dsinfo, k)
1380        else if(nods==1)
1381          ds
1382        else ds[[k]]
1383
1384      cat('.')
1385      if(!length(w)) {
1386        cat('Empty dataset', k, 'ignored\n')
1387        next
1388      }
1389
1390      nam      <- tolower(makeNames(names(w), allow=allow))
1391      names(w) <- nam
1392      dinfo    <- dsinfo[[k]]
1393      fmt      <- sub('^\\$','',dinfo$format)
1394      lab      <- dinfo$label
1395      ndinfo   <- tolower(makeNames(dinfo$name, allow=allow))
1396      names(lab) <- names(fmt) <- ndinfo
1397      for(i in 1:length(w)) {
1398        changed <- FALSE
1399        x  <- w[[i]]
1400        fi <- fmt[nam[i]]; names(fi) <- NULL
1401        if(fi != '' && length(finfo) && (fi %in% names(finfo))) {
1402          f <- finfo[[fi]]
1403          if(length(f)) {  ## may be NULL because had a range in format
1404            x <- factor(x, f$value, f$label)
1405            attr(x, 'format') <- fi
1406            changed <- TRUE
1407          }
1408        }
1409        if(is.numeric(x)) {
1410          if(fi %in% sasdateform) {
1411            x <- importConvertDateTime(x, 'date', rootsoftware)
1412            changed <- TRUE
1413          } else if(fi %in% sastimeform) {
1414            x <- importConvertDateTime(x, 'time', rootsoftware)
1415            changed <- TRUE
1416          } else if(fi %in% sasdatetimeform) {
1417            x <- importConvertDateTime(x, 'datetime', rootsoftware)
1418            changed <- TRUE
1419          } else if(force.single) {
1420            if(all(is.na(x))) {
1421              storage.mode(x) <- 'integer'
1422              changed <- TRUE
1423            } else if(max(abs(x),na.rm=TRUE) <= (2^31-1) &&
1424                      all(floor(x) == x, na.rm=TRUE)) {
1425              storage.mode(x) <- 'integer'
1426              changed <- TRUE
1427            }
1428          }
1429        } else if(method=='read.xport' && possiblyConvertChar && is.factor(x)) {
1430          if((is.logical(as.is) && as.is) ||
1431             (is.numeric(as.is) && length(unique(x)) >= as.is*length(x))) {
1432            x <- as.character(x)
1433            changed <- TRUE
1434          }
1435        } else if(possiblyConvertChar && is.character(x)) {
1436          if((is.logical(as.is) && !as.is) ||
1437             (is.numeric(as.is) && length(unique(x)) < as.is*length(x))) {
1438            x <- factor(x, exclude='')
1439            changed <- TRUE
1440          }
1441        }
1442
1443        lz <- lab[nam[i]]
1444        if(lz != '') {
1445          names(lz) <- NULL
1446          label(x)  <- lz
1447          changed   <- TRUE
1448        }
1449
1450        if(changed)
1451          w[[i]] <- x
1452      }
1453
1454      cat('.\n')
1455      if(length(out)) {
1456        nam <- gsub('_','.',dsn[j])
1457        assign(nam, w)
1458        ## ugly, but a way to get actual data frame name into first
1459        ## argument of save( )
1460        eval(parse(text=paste('save(',nam,', file="',
1461                              paste(out, '/', nam,'.rda',sep=''),
1462                              '", compress=TRUE)',sep='')))
1463        if(length(FUN) && length(w))
1464          funout[[nam]] <- FUN(w)
1465
1466        remove(nam)
1467      } else if(nds > 1)
1468        res[[j]] <- w
1469    }
1470
1471    if(length(out)) {
1472      names(dsinfo) <- gsub('_','.',tolower(names(dsinfo)))
1473      if(length(FUN))
1474        attr(dsinfo, 'FUN') <- funout
1475
1476      invisible(dsinfo)
1477    } else if(nds > 1)
1478      res
1479    else w
1480  }
1481
1482  ## Use dataload program to create a structure like read.xport does
1483  read.xportDataload <- function(file, dsnames) {
1484    outf <- substring(tempfile(tmpdir=''),2)
1485    file.copy(file, paste(tempdir(),outf,sep='/'))
1486    curwd <- getwd()
1487    on.exit(setwd(curwd))
1488    setwd(tempdir())
1489    n <- length(dsnames)
1490    w <- vector('list', n); names(w) <- dsnames
1491    for(a in dsnames) {
1492      status <- system(paste('dataload', outf, 'zzzz.rda', a),
1493                       intern=FALSE)
1494      if(status==0) {
1495        load('zzzz.rda')
1496        names(zzzz) <- makeNames(names(zzzz))
1497        w[[a]] <- zzzz
1498      }
1499    }
1500
1501    w
1502  }
1503
1504  ## Read _contents_.csv and store it like lookup.xport output
1505  lookupSASContents <- function(sasdir)
1506  {
1507    w <- read.csv(paste(sasdir,'_contents_.csv',sep='/'), as.is=TRUE)
1508    z <- tapply(w$NOBS, w$MEMNAME, function(x)x[1])
1509    if(any(z == 0)) {
1510      cat('\nDatasets with 0 observations ignored:\n')
1511      print(names(z)[z == 0], quote=FALSE)
1512      w <- subset(w, NOBS > 0)
1513    }
1514
1515    w$TYPE <- ifelse(w$TYPE==1, 'numeric', 'character')
1516    names(w) <- tolower(names(w))
1517    unclass(split(subset(w,select=-c(memname,memlabel)), w$memname))
1518  }
1519
1520  ## Read all SAS csv export files and store in a list
1521  readSAScsv <- function(sasdir, dsinfo, dsnames=names(dsinfo)) {
1522    sasnobs <- sapply(dsinfo, function(x)x$nobs[1])
1523    multi <- length(dsnames) > 1
1524    if(multi) {
1525      w <- vector('list', length(dsnames))
1526      names(w) <- dsnames
1527    }
1528
1529    for(a in dsnames) {
1530      z <- read.csv(paste(sasdir,'/',a,'.csv', sep=''),
1531                    as.is=TRUE, blank.lines.skip=FALSE,
1532                    comment.char="")
1533
1534      importedLength <- length(z[[1]])
1535      if(importedLength != sasnobs[a])
1536        cat('\nError: NOBS reported by SAS (',sasnobs[a],') for dataset ',
1537            a,' is not the same as imported length (', importedLength,
1538            ')\n', sep='')
1539
1540      if(multi)
1541        w[[a]] <- z
1542    }
1543
1544    if(multi)
1545      w
1546    else z
1547  }
1548
1549
1550
1551csv.get <- function(file, lowernames=FALSE, datevars=NULL, datetimevars=NULL,
1552                    dateformat='%F', fixdates=c('none','year'),
1553                    comment.char = "", autodates=TRUE, allow=NULL,
1554                    charfactor=FALSE,
1555                    sep=',', skip=0, vnames=NULL, labels=NULL, ...){
1556  fixdates <- match.arg(fixdates)
1557  if(length(vnames))
1558    vnames <- scan(file, what=character(0), skip=vnames-1, nlines=1,
1559                   sep=sep, quiet=TRUE)
1560  if(length(labels))
1561    labels <- scan(file, what=character(0), skip=labels-1, nlines=1,
1562                   sep=sep, quiet=TRUE)
1563
1564  w <- if(length(vnames))
1565    read.csv(file, check.names=FALSE, comment.char=comment.char,
1566             header=FALSE, col.names=vnames, skip=skip, sep=sep, ...)
1567  else read.csv(file, check.names=FALSE, comment.char=comment.char,
1568                sep=sep, skip=skip, ...)
1569  n <- nam <- names(w)
1570  m <- makeNames(n, unique=TRUE, allow=allow)
1571  if(length(labels)) n <- labels
1572  if(lowernames)
1573    m <- casefold(m)
1574
1575  changed <- any(m != nam)
1576  if(changed)
1577    names(w) <- m
1578
1579  if(autodates) {
1580    tmp <- w
1581    names(tmp) <- NULL
1582
1583    for(i in 1:length(tmp)) {
1584      if(! is.character(tmp[[1]]))
1585        next
1586    }
1587  }
1588  cleanup.import(w,
1589                 labels=if(length(labels))labels else if(changed)n else NULL,
1590                 datevars=datevars, datetimevars=datetimevars,
1591                 dateformat=dateformat,
1592                 fixdates=fixdates, charfactor=charfactor)
1593}
1594
1595
1596sasdsLabels <- function(file)
1597{
1598  w <- scan(file, sep='\n', what='', quiet=TRUE)
1599  i <- grep('Data Set Name:', w)
1600  if(!length(i))
1601    return(NULL)
1602
1603  n <- tolower(sub('.*\\.([A-Z0-9\\_]*)[[:space:]]+.*','\\1',w[i]))
1604  w <- gsub('\t','',w)
1605  labs <- ifelse(nchar(w[i-1])==0,w[i-2],w[i-1])
1606  names(labs) <- n
1607  labs
1608}
1609